diff --git a/opencl/test/unit_test/command_queue/CMakeLists.txt b/opencl/test/unit_test/command_queue/CMakeLists.txt index e158cfd912..bfd9a3a768 100644 --- a/opencl/test/unit_test/command_queue/CMakeLists.txt +++ b/opencl/test/unit_test/command_queue/CMakeLists.txt @@ -94,6 +94,9 @@ set(IGDRCL_SRCS_tests_command_queue if(TESTS_XEHP_PLUS) list(APPEND IGDRCL_SRCS_tests_command_queue + ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_tests_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_media_kernel_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_resource_barier_tests_xehp_plus.cpp ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_plus.cpp ) endif() diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 6b2ef66d06..4b10de2cb6 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -8,6 +8,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/helpers/array_count.h" #include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/timestamp_packet.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/memory_manager.h" @@ -30,6 +31,7 @@ #include "opencl/test/unit_test/fixtures/dispatch_flags_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h" #include "opencl/test/unit_test/fixtures/memory_management_fixture.h" +#include "opencl/test/unit_test/fixtures/multi_tile_fixture.h" #include "opencl/test/unit_test/helpers/raii_hw_helper.h" #include "opencl/test/unit_test/libult/ult_command_stream_receiver.h" #include "opencl/test/unit_test/mocks/mock_allocation_properties.h" @@ -1684,3 +1686,229 @@ HWTEST_F(CommandQueueOnSpecificEngineTests, givenNotInitializedCcsOsContextWhenC ASSERT_EQ(&osContext, 
queue.gpgpuEngine->osContext); EXPECT_TRUE(osContext.isInitialized()); } + +TEST_F(MultiTileFixture, givenSubDeviceWhenQueueIsCreatedThenItContainsProperDevice) { + auto tile0 = platform()->getClDevice(0)->getDeviceById(0); + + const cl_device_id deviceId = tile0; + auto returnStatus = CL_SUCCESS; + auto context = clCreateContext(nullptr, 1, &deviceId, nullptr, nullptr, &returnStatus); + EXPECT_EQ(CL_SUCCESS, returnStatus); + EXPECT_NE(nullptr, context); + + auto commandQueue = clCreateCommandQueueWithProperties(context, tile0, nullptr, &returnStatus); + EXPECT_EQ(CL_SUCCESS, returnStatus); + EXPECT_NE(nullptr, commandQueue); + + auto neoQueue = castToObject(commandQueue); + EXPECT_EQ(&tile0->getDevice(), &neoQueue->getDevice()); + + clReleaseCommandQueue(commandQueue); + clReleaseContext(context); +} + +TEST_F(MultiTileFixture, givenTile1WhenQueueIsCreatedThenItContainsTile1Device) { + auto tile1 = platform()->getClDevice(0)->getDeviceById(1); + + const cl_device_id deviceId = tile1; + auto returnStatus = CL_SUCCESS; + auto context = clCreateContext(nullptr, 1, &deviceId, nullptr, nullptr, &returnStatus); + EXPECT_EQ(CL_SUCCESS, returnStatus); + EXPECT_NE(nullptr, context); + + auto commandQueue = clCreateCommandQueueWithProperties(context, tile1, nullptr, &returnStatus); + EXPECT_EQ(CL_SUCCESS, returnStatus); + EXPECT_NE(nullptr, commandQueue); + + auto neoQueue = castToObject(commandQueue); + EXPECT_EQ(&tile1->getDevice(), &neoQueue->getDevice()); + + clReleaseCommandQueue(commandQueue); + clReleaseContext(context); +} + +struct CopyOnlyQueueTests : ::testing::Test { + void SetUp() override { + typeUsageRcs.first = EngineHelpers::remapEngineTypeToHwSpecific(typeUsageRcs.first, *defaultHwInfo); + + auto device = MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get()); + if (device->engineGroups[static_cast(EngineGroupType::Copy)].empty()) { + GTEST_SKIP(); + } + device->engineGroups.clear(); + 
device->engineGroups.resize(static_cast(EngineGroupType::MaxEngineGroups)); + device->engines.clear(); + + device->createEngine(0, typeUsageRcs); + device->createEngine(1, typeUsageBcs); + bcsEngine = &device->getEngines().back(); + + clDevice = std::make_unique(device); + + context = std::make_unique(clDevice.get()); + + properties[1] = device->getIndexOfNonEmptyEngineGroup(EngineGroupType::Copy); + } + + EngineTypeUsage typeUsageBcs = EngineTypeUsage{aub_stream::EngineType::ENGINE_BCS, EngineUsage::Regular}; + EngineTypeUsage typeUsageRcs = EngineTypeUsage{aub_stream::EngineType::ENGINE_RCS, EngineUsage::Regular}; + + std::unique_ptr clDevice{}; + std::unique_ptr context{}; + std::unique_ptr queue{}; + const EngineControl *bcsEngine = nullptr; + + cl_queue_properties properties[5] = {CL_QUEUE_FAMILY_INTEL, 0, CL_QUEUE_INDEX_INTEL, 0, 0}; +}; + +TEST_F(CopyOnlyQueueTests, givenBcsSelectedWhenCreatingCommandQueueThenItIsCopyOnly) { + MockCommandQueue queue{context.get(), clDevice.get(), properties, false}; + EXPECT_EQ(bcsEngine->commandStreamReceiver, queue.getBcsCommandStreamReceiver()); + EXPECT_NE(nullptr, queue.timestampPacketContainer); + EXPECT_TRUE(queue.isCopyOnly); +} + +HWTEST_F(CopyOnlyQueueTests, givenBcsSelectedWhenEnqueuingCopyThenBcsIsUsed) { + auto srcBuffer = std::unique_ptr{BufferHelper<>::create(context.get())}; + auto dstBuffer = std::unique_ptr{BufferHelper<>::create(context.get())}; + MockCommandQueueHw queue{context.get(), clDevice.get(), properties}; + auto commandStream = &bcsEngine->commandStreamReceiver->getCS(1024); + + auto usedCommandStream = commandStream->getUsed(); + cl_int retVal = queue.enqueueCopyBuffer( + srcBuffer.get(), + dstBuffer.get(), + 0, + 0, + 1, + 0, + nullptr, + nullptr); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(usedCommandStream, commandStream->getUsed()); +} + +HWTEST_F(CopyOnlyQueueTests, givenBlitterEnabledWhenCreatingBcsCommandQueueThenReturnSuccess) { + DebugManagerStateRestore restore{}; + 
DebugManager.flags.EnableBlitterOperationsSupport.set(1); + + cl_int retVal{}; + auto commandQueue = clCreateCommandQueueWithProperties(context.get(), clDevice.get(), properties, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, commandQueue); + EXPECT_EQ(CL_SUCCESS, clReleaseCommandQueue(commandQueue)); +} + +using MultiEngineQueueHwTests = ::testing::Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiEngineQueueHwTests, givenQueueFamilyPropertyWhenQueueIsCreatedThenSelectValidEngine) { + initPlatform(); + HardwareInfo localHwInfo = *defaultHwInfo; + + localHwInfo.featureTable.ftrCCSNode = true; + + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&localHwInfo)); + MockContext context(device.get()); + context.contextType = ContextType::CONTEXT_TYPE_UNRESTRICTIVE; + + bool ccsFound = false; + for (auto &engine : device->engines) { + if (engine.osContext->getEngineType() == aub_stream::EngineType::ENGINE_CCS) { + ccsFound = true; + break; + } + } + + struct CommandQueueTestValues { + CommandQueueTestValues() = delete; + CommandQueueTestValues(cl_queue_properties engineFamily, cl_queue_properties engineIndex, aub_stream::EngineType expectedEngine) + : expectedEngine(expectedEngine) { + properties[1] = engineFamily; + properties[3] = engineIndex; + }; + + cl_command_queue clCommandQueue = nullptr; + CommandQueue *commandQueueObj = nullptr; + cl_queue_properties properties[5] = {CL_QUEUE_FAMILY_INTEL, 0, CL_QUEUE_INDEX_INTEL, 0, 0}; + aub_stream::EngineType expectedEngine; + }; + auto addTestValueIfAvailable = [&](std::vector &vec, EngineGroupType engineGroup, cl_queue_properties queueIndex, aub_stream::EngineType engineType, bool csEnabled) { + if (csEnabled) { + const auto familyIndex = device->getDevice().getIndexOfNonEmptyEngineGroup(engineGroup); + vec.push_back(CommandQueueTestValues(static_cast(familyIndex), queueIndex, engineType)); + } + }; + auto retVal = CL_SUCCESS; + const auto &ccsInstances = 
localHwInfo.gtSystemInfo.CCSInfo.Instances.Bits; + std::vector commandQueueTestValues; + addTestValueIfAvailable(commandQueueTestValues, EngineGroupType::RenderCompute, 0, EngineHelpers::remapEngineTypeToHwSpecific(aub_stream::EngineType::ENGINE_RCS, device->getHardwareInfo()), true); + addTestValueIfAvailable(commandQueueTestValues, EngineGroupType::Compute, 0, aub_stream::ENGINE_CCS, ccsFound); + addTestValueIfAvailable(commandQueueTestValues, EngineGroupType::Compute, 1, aub_stream::ENGINE_CCS1, ccsInstances.CCS1Enabled); + addTestValueIfAvailable(commandQueueTestValues, EngineGroupType::Compute, 2, aub_stream::ENGINE_CCS2, ccsInstances.CCS2Enabled); + addTestValueIfAvailable(commandQueueTestValues, EngineGroupType::Compute, 3, aub_stream::ENGINE_CCS3, ccsInstances.CCS3Enabled); + + for (auto &commandQueueTestValue : commandQueueTestValues) { + if (commandQueueTestValue.properties[1] >= HwHelper::getGpgpuEnginesCount(device->getHardwareInfo())) { + continue; + } + commandQueueTestValue.clCommandQueue = clCreateCommandQueueWithProperties(&context, device.get(), + &commandQueueTestValue.properties[0], &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + commandQueueTestValue.commandQueueObj = castToObject(commandQueueTestValue.clCommandQueue); + + auto &cmdQueueEngine = commandQueueTestValue.commandQueueObj->getGpgpuCommandStreamReceiver().getOsContext().getEngineType(); + EXPECT_EQ(commandQueueTestValue.expectedEngine, cmdQueueEngine); + + clReleaseCommandQueue(commandQueueTestValue.commandQueueObj); + } +} + +TEST_F(MultiTileFixture, givenDefaultContextWithRootDeviceWhenQueueIsCreatedThenQueueIsMultiEngine) { + auto rootDevice = platform()->getClDevice(0); + MockContext context(rootDevice); + context.contextType = ContextType::CONTEXT_TYPE_DEFAULT; + + auto rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver; + + MockCommandQueue queue(&context, rootDevice, nullptr, false); + ASSERT_NE(nullptr, queue.gpgpuEngine); + 
EXPECT_EQ(rootCsr->isMultiOsContextCapable(), queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()); + EXPECT_EQ(rootCsr, queue.gpgpuEngine->commandStreamReceiver); +} + +TEST_F(MultiTileFixture, givenDefaultContextWithSubdeviceWhenQueueIsCreatedThenQueueIsNotMultiEngine) { + auto subdevice = platform()->getClDevice(0)->getDeviceById(0); + MockContext context(subdevice); + context.contextType = ContextType::CONTEXT_TYPE_DEFAULT; + + MockCommandQueue queue(&context, subdevice, nullptr, false); + ASSERT_NE(nullptr, queue.gpgpuEngine); + EXPECT_FALSE(queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()); +} + +TEST_F(MultiTileFixture, givenUnrestrictiveContextWithRootDeviceWhenQueueIsCreatedThenQueueIsMultiEngine) { + auto rootDevice = platform()->getClDevice(0); + MockContext context(rootDevice); + context.contextType = ContextType::CONTEXT_TYPE_UNRESTRICTIVE; + + auto rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver; + + MockCommandQueue queue(&context, rootDevice, nullptr, false); + ASSERT_NE(nullptr, queue.gpgpuEngine); + EXPECT_EQ(rootCsr->isMultiOsContextCapable(), queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()); + EXPECT_EQ(rootCsr, queue.gpgpuEngine->commandStreamReceiver); +} + +TEST_F(MultiTileFixture, givenNotDefaultContextWithRootDeviceAndTileIdMaskWhenQueueIsCreatedThenQueueIsMultiEngine) { + auto rootClDevice = platform()->getClDevice(0); + auto rootDevice = static_cast(&rootClDevice->getDevice()); + MockContext context(rootClDevice); + context.contextType = ContextType::CONTEXT_TYPE_UNRESTRICTIVE; + + auto rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver; + + MockCommandQueue queue(&context, rootClDevice, nullptr, false); + ASSERT_NE(nullptr, queue.gpgpuEngine); + EXPECT_EQ(rootCsr->isMultiOsContextCapable(), queue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()); + EXPECT_EQ(rootCsr, queue.gpgpuEngine->commandStreamReceiver); +} \ No newline at end of file diff --git 
a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_plus.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_plus.cpp new file mode 100644 index 0000000000..b37266a53f --- /dev/null +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_plus.cpp @@ -0,0 +1,1677 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/walker_partition_xehp_plus.h" +#include "shared/source/command_stream/linear_stream.h" +#include "shared/source/gmm_helper/gmm_helper.h" +#include "shared/source/helpers/hw_helper.h" +#include "shared/source/helpers/timestamp_packet.h" +#include "shared/source/indirect_heap/indirect_heap.h" +#include "shared/source/memory_manager/unified_memory_manager.h" +#include "shared/source/os_interface/os_interface.h" +#include "shared/source/utilities/tag_allocator.h" +#include "shared/test/common/cmd_parse/hw_parse.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/helpers/variable_backup.h" +#include "shared/test/common/mocks/mock_device.h" +#include "shared/test/common/mocks/mock_execution_environment.h" + +#include "opencl/source/built_ins/builtins_dispatch_builder.h" +#include "opencl/source/command_queue/gpgpu_walker.h" +#include "opencl/source/command_queue/hardware_interface.h" +#include "opencl/source/helpers/dispatch_info_builder.h" +#include "opencl/source/kernel/kernel.h" +#include "opencl/test/unit_test/command_stream/linear_stream_fixture.h" +#include "opencl/test/unit_test/mocks/mock_allocation_properties.h" +#include "opencl/test/unit_test/mocks/mock_cl_device.h" +#include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_kernel.h" +#include "opencl/test/unit_test/mocks/mock_mdi.h" +#include "opencl/test/unit_test/mocks/mock_platform.h" +#include "opencl/test/unit_test/mocks/mock_timestamp_container.h" + +using namespace NEO; + 
+using WalkerDispatchTest = ::testing::Test; + +struct XeHPPlusDispatchWalkerBasicFixture : public LinearStreamFixture { + void SetUp() override { + LinearStreamFixture::SetUp(); + memset(globalOffsets, 0, sizeof(globalOffsets)); + memset(startWorkGroups, 0, sizeof(startWorkGroups)); + + localWorkSizesIn[0] = 16; + localWorkSizesIn[1] = localWorkSizesIn[2] = 1; + numWorkGroups[0] = numWorkGroups[1] = numWorkGroups[2] = 1; + simd = 16; + + device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get(), rootDeviceIndex)); + context = std::make_unique(device.get()); + kernel = std::make_unique(*device, context.get()); + sizeGrf = device->getHardwareInfo().capabilityTable.grfSize; + sizeGrfDwords = sizeGrf / sizeof(uint32_t); + + for (uint32_t i = 0; i < sizeGrfDwords; i++) { + crossThreadDataGrf[i] = i; + crossThreadDataTwoGrf[i] = i + 2; + } + for (uint32_t i = sizeGrfDwords; i < sizeGrfDwords * 2; i++) { + crossThreadDataTwoGrf[i] = i + 2; + } + } + + DebugManagerStateRestore restore; + + size_t globalOffsets[3]; + size_t startWorkGroups[3]; + size_t numWorkGroups[3]; + size_t localWorkSizesIn[3]; + uint32_t simd; + uint32_t sizeGrf; + uint32_t sizeInlineData; + uint32_t sizeGrfDwords; + uint32_t crossThreadDataGrf[16]; + uint32_t crossThreadDataTwoGrf[32]; + + const uint32_t rootDeviceIndex = 1u; + std::unique_ptr device; + std::unique_ptr context; + std::unique_ptr kernel; +}; + +using XeHPPlusDispatchWalkerBasicTest = Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenWorkDimOneThenLocalWorkSizeEqualsLocalXDim) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData( + computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizesIn, simd, 3, 
true, false, 5u); + EXPECT_EQ(localWorkSizesIn[0], localWorkSize); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + EXPECT_EQ(0u, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); + EXPECT_EQ(0u, computeWalker->getWalkOrder()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenWorkDimTwoThenLocalWorkSizeEqualsProductLocalXandYDim) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + localWorkSizesIn[1] = 8; + + auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData( + computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizesIn, simd, 3, true, false, 0u); + EXPECT_EQ(localWorkSizesIn[0] * localWorkSizesIn[1], localWorkSize); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + EXPECT_EQ(0u, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenWorkDimThreeThenLocalWorkSizeEqualsProductLocalXandYandZDim) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + localWorkSizesIn[1] = 8; + localWorkSizesIn[2] = 2; + + auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData( + computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizesIn, 
simd, 3, true, false, 0u); + EXPECT_EQ(localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2], localWorkSize); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + EXPECT_EQ(0u, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimOneWhenAskHwForLocalIdsThenExpectGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 0, 0}); + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 1, false, false, 4u); + + auto localX = static_cast(computeWalker->getLocalXMaximum() + 1); + auto localY = static_cast(computeWalker->getLocalYMaximum() + 1); + auto localZ = static_cast(computeWalker->getLocalZMaximum() + 1); + EXPECT_EQ(localWorkSizesIn[0], localX); + EXPECT_EQ(localWorkSizesIn[1], localY); + EXPECT_EQ(localWorkSizesIn[2], localZ); + + constexpr uint32_t expectedEmit = (1 << 0); + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(1u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); + EXPECT_EQ(4u, computeWalker->getWalkOrder()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimTwoWhenOnlyYIdPresentAskHwForLocalIdsThenExpectGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + 
kernel->kernelInfo.setLocalIds({0, 1, 0}); + localWorkSizesIn[1] = 16; + localWorkSizesIn[0] = localWorkSizesIn[2] = 1; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 2, false, false, 0u); + + auto localX = static_cast(computeWalker->getLocalXMaximum() + 1); + auto localY = static_cast(computeWalker->getLocalYMaximum() + 1); + auto localZ = static_cast(computeWalker->getLocalZMaximum() + 1); + EXPECT_EQ(localWorkSizesIn[0], localX); + EXPECT_EQ(localWorkSizesIn[1], localY); + EXPECT_EQ(localWorkSizesIn[2], localZ); + + constexpr uint32_t expectedEmit = (1 << 1); + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(1u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkThreeTwoWhenOnlyZIdPresentAskHwForLocalIdsThenExpectGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({0, 0, 1}); + localWorkSizesIn[2] = 16; + localWorkSizesIn[0] = localWorkSizesIn[1] = 1; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 2, false, false, 0u); + + auto localX = static_cast(computeWalker->getLocalXMaximum() + 1); + auto localY = static_cast(computeWalker->getLocalYMaximum() + 1); + auto localZ = static_cast(computeWalker->getLocalZMaximum() + 1); + EXPECT_EQ(localWorkSizesIn[0], localX); + EXPECT_EQ(localWorkSizesIn[1], localY); + EXPECT_EQ(localWorkSizesIn[2], localZ); + + constexpr uint32_t expectedEmit = (1 << 2); + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + 
EXPECT_EQ(1u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenDifferentSIMDsizesWhenLocalIdsGeneratedThenMessageSizeIsSetToProperValue) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({0, 0, 1}); + localWorkSizesIn[2] = 16; + localWorkSizesIn[0] = localWorkSizesIn[1] = 1; + + uint32_t simdProgramming[3][2] = {{32, 2}, {16, 1}, {8, 0}}; // {given, expected} + bool walkerInput[4][2] = {{false, false}, {true, false}, {false, true}, {true, true}}; // {runtime local ids, inline data} + + for (uint32_t i = 0; i < 4; i++) { + for (uint32_t j = 0; j < 3; j++) { + *computeWalker = FamilyType::cmdInitGpgpuWalker; + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simdProgramming[j][0], 2, + walkerInput[i][0], walkerInput[i][1], 0u); + EXPECT_EQ(simdProgramming[j][1], computeWalker->getMessageSimd()); + } + } +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimTwoWhenAskHwForLocalIdsThenExpectGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 1, 0}); + localWorkSizesIn[1] = 8; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 2, false, false, 0u); + + auto localX = static_cast(computeWalker->getLocalXMaximum() + 1); + auto localY = static_cast(computeWalker->getLocalYMaximum() + 1); + auto 
localZ = static_cast(computeWalker->getLocalZMaximum() + 1); + EXPECT_EQ(localWorkSizesIn[0], localX); + EXPECT_EQ(localWorkSizesIn[1], localY); + EXPECT_EQ(localWorkSizesIn[2], localZ); + + constexpr uint32_t expectedEmit = (1 << 0) | (1 << 1); + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(1u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimThreeWhenAskHwForLocalIdsThenExpectGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 1, 1}); + localWorkSizesIn[1] = 8; + localWorkSizesIn[2] = 2; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 3, false, false, 0u); + auto localX = static_cast(computeWalker->getLocalXMaximum() + 1); + auto localY = static_cast(computeWalker->getLocalYMaximum() + 1); + auto localZ = static_cast(computeWalker->getLocalZMaximum() + 1); + EXPECT_EQ(localWorkSizesIn[0], localX); + EXPECT_EQ(localWorkSizesIn[1], localY); + EXPECT_EQ(localWorkSizesIn[2], localZ); + + constexpr uint32_t expectedEmit = (1 << 0) | (1 << 1) | (1 << 2); + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(1u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimThreeWhenAskHwForLocalIdsAndNoLocalIdsUsedThenExpectNoGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + 
kernel->kernelInfo.setLocalIds({0, 0, 0}); + localWorkSizesIn[1] = 8; + localWorkSizesIn[2] = 2; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 3, false, false, 0u); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + constexpr uint32_t expectedEmit = 0; + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimThreeWhenNotAskHwForLocalIdsAndLocalIdsUsedThenExpectNoGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 1, 1}); + localWorkSizesIn[1] = 8; + localWorkSizesIn[2] = 2; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 3, true, false, 0u); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + constexpr uint32_t expectedEmit = 0; + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(0u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimOneWhenAskForInlineDataAndNoLocalIdsPresentThenExpectOnlyInlineFieldSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = 
static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 1, true, true, 0u); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + EXPECT_EQ(0u, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(1u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimOneWhenAskForInlineDataAndLocalIdsPresentThenExpectInlineAndDoNotExpectEmitLocalIdFieldSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 0, 0}); + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 1, true, true, 0u); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + constexpr uint32_t expectedEmit = 0u; + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(1u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimThreeWhenAskForInlineDataAndLocalIdsPresentThenDoNotExpectEmitLocalIdFieldSetButExpectInlineSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = 
FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 1, 1}); + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 3, true, true, 0u); + + EXPECT_EQ(0u, computeWalker->getLocalXMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalYMaximum()); + EXPECT_EQ(0u, computeWalker->getLocalZMaximum()); + + constexpr uint32_t expectedEmit = 0u; + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(0u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(1u, computeWalker->getEmitInlineParameter()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenWorkDimThreeWhenAskHwForLocalIdsAndInlineDataThenExpectGenerationFieldsSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + COMPUTE_WALKER *computeWalker = static_cast(linearStream.getSpace(sizeof(COMPUTE_WALKER))); + *computeWalker = FamilyType::cmdInitGpgpuWalker; + + kernel->kernelInfo.setLocalIds({1, 1, 1}); + localWorkSizesIn[1] = 8; + localWorkSizesIn[2] = 2; + + GpgpuWalkerHelper::setGpgpuWalkerThreadData(computeWalker, kernel->kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups, numWorkGroups, + localWorkSizesIn, simd, 3, false, true, 5u); + auto localX = static_cast(computeWalker->getLocalXMaximum() + 1); + auto localY = static_cast(computeWalker->getLocalYMaximum() + 1); + auto localZ = static_cast(computeWalker->getLocalZMaximum() + 1); + EXPECT_EQ(localWorkSizesIn[0], localX); + EXPECT_EQ(localWorkSizesIn[1], localY); + EXPECT_EQ(localWorkSizesIn[2], localZ); + + constexpr uint32_t expectedEmit = (1 << 0) | (1 << 1) | (1 << 2); + EXPECT_EQ(expectedEmit, computeWalker->getEmitLocalId()); + EXPECT_EQ(1u, computeWalker->getGenerateLocalId()); + EXPECT_EQ(1u, computeWalker->getEmitInlineParameter()); + EXPECT_EQ(5u, computeWalker->getWalkOrder()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, 
givenTimestampPacketWhenDispatchingThenProgramPostSyncData) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + + MockKernelWithInternals kernel1(*device); + MockKernelWithInternals kernel2(*device); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + + TimestampPacketContainer timestampPacketContainer; + timestampPacketContainer.add(device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + timestampPacketContainer.add(device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + + MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel1.mockKernel, kernel2.mockKernel})); + + MockCommandQueue cmdQ(context.get(), device.get(), nullptr, false); + auto &cmdStream = cmdQ.getCS(0); + + HardwareInterface::dispatchWalker( + cmdQ, + multiDispatchInfo, + CsrDependencies(), + nullptr, + nullptr, + nullptr, + nullptr, + ×tampPacketContainer, + false); + + HardwareParse hwParser; + hwParser.parseCommands(cmdStream, 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto gmmHelper = device->getGmmHelper(); + auto expectedMocs = gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(FamilyType::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walker->getPostSync().getOperation()); + EXPECT_TRUE(walker->getPostSync().getDataportPipelineFlush()); + EXPECT_EQ(expectedMocs, walker->getPostSync().getMocs()); + auto contextStartAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampPacketContainer.peekNodes()[0]); + EXPECT_EQ(contextStartAddress, walker->getPostSync().getDestinationAddress()); + + auto secondWalkerItor = find(++hwParser.itorWalker, hwParser.cmdList.end()); + auto secondWalker = genCmdCast(*secondWalkerItor); + + EXPECT_EQ(FamilyType::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, 
secondWalker->getPostSync().getOperation()); + EXPECT_TRUE(secondWalker->getPostSync().getDataportPipelineFlush()); + EXPECT_EQ(expectedMocs, walker->getPostSync().getMocs()); + contextStartAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampPacketContainer.peekNodes()[1]); + EXPECT_EQ(contextStartAddress, secondWalker->getPostSync().getDestinationAddress()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenDebugVariableEnabledWhenEnqueueingThenWriteWalkerStamp) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + DebugManager.flags.EnableTimestampPacket.set(true); + + auto testDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext testContext(testDevice.get()); + auto cmdQ = std::make_unique>(&testContext, testDevice.get(), nullptr); + MockKernelWithInternals testKernel(*testDevice, &testContext); + + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(testKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + EXPECT_NE(nullptr, cmdQ->timestampPacketContainer.get()); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + + auto gmmHelper = device->getGmmHelper(); + auto expectedMocs = gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED); + + auto &postSyncData = walker->getPostSync(); + EXPECT_EQ(FamilyType::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, + postSyncData.getOperation()); + EXPECT_TRUE(postSyncData.getDataportPipelineFlush()); + EXPECT_EQ(expectedMocs, postSyncData.getMocs()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenDebugVariableEnabledWhenMocsValueIsOverwrittenThenPostSyncContainsProperSetting) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + auto mocsValue = 8u; + 
DebugManager.flags.EnableTimestampPacket.set(true); + DebugManager.flags.OverridePostSyncMocs.set(mocsValue); + + auto testDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext testContext(testDevice.get()); + auto cmdQ = std::make_unique>(&testContext, testDevice.get(), nullptr); + MockKernelWithInternals testKernel(*testDevice, &testContext); + + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(testKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + EXPECT_NE(nullptr, cmdQ->timestampPacketContainer.get()); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + + auto &postSyncData = walker->getPostSync(); + EXPECT_EQ(mocsValue, postSyncData.getMocs()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeThenAddEnoughSpace) { + MockCommandQueueHw cmdQ(context.get(), device.get(), nullptr); + MockKernelWithInternals kernel1(*device); + MockKernelWithInternals kernel2(*device); + MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel1.mockKernel, kernel2.mockKernel})); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + auto sizeWithDisabled = cmdQ.requestedCmdStreamSize; + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + getCommandStream(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false); + auto sizeWithEnabled = cmdQ.requestedCmdStreamSize; + + EXPECT_EQ(sizeWithEnabled, sizeWithDisabled + 0); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, 
givenDebugVariableEnabledWhenEnqueueingThenWritePostsyncOperationInImmWriteMode) { + DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.set(true); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + auto &postSyncData = walker->getPostSync(); + EXPECT_EQ(FamilyType::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_IMMEDIATE_DATA, + postSyncData.getOperation()); + auto contextEndAddress = TimestampPacketHelper::getContextEndGpuAddress(*cmdQ->timestampPacketContainer->peekNodes()[0]); + EXPECT_EQ(contextEndAddress, postSyncData.getDestinationAddress()); + EXPECT_EQ(0x2'0000'0002u, postSyncData.getImmediateData()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenDebugVariableEnabledWhenEnqueueingThenSystolicIsProgrammed) { + DebugManager.flags.OverrideSystolicInComputeWalker.set(true); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_TRUE(walker->getSystolicModeEnable()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenAutoLocalIdsGenerationEnabledWhenDispatchMeetCriteriaThenExpectNoLocalIdsAndProperIsaAddress) { + using COMPUTE_WALKER = typename 
FamilyType::COMPUTE_WALKER; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + auto &commandStream = cmdQ->getCS(1024); + auto usedBeforeCS = commandStream.getUsed(); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.entryPoints.skipPerThreadDataLoad = 128; + kd.kernelAttributes.localId[0] = 1; + kd.kernelAttributes.localId[1] = 0; + kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 1; + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + size_t lws[] = {16, 1, 1}; + size_t globalOffsets[] = {0, 0, 0}; + + MultiDispatchInfo multiDispatchInfo(kernel->mockKernel); + DispatchInfoBuilder builder(*device); + builder.setDispatchGeometry(1, gws, lws, globalOffsets); + builder.setKernel(kernel->mockKernel); + builder.bake(multiDispatchInfo); + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + auto usedAfterCS = commandStream.getUsed(); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(COMPUTE_WALKER::DWORD_LENGTH_FIXED_SIZE, walker->getDwordLength()); + EXPECT_EQ(0u, walker->getEmitInlineParameter()); + + EXPECT_EQ(1u, walker->getGenerateLocalId()); + EXPECT_EQ(1u, walker->getEmitLocalId()); + uint32_t expectedIndirectDataLength = alignUp(kernel->mockKernel->getCrossThreadDataSize(), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + 
INTERFACE_DESCRIPTOR_DATA &idd = walker->getInterfaceDescriptor(); + uint64_t expectedKernelStartOffset = kernel->mockKernel->getKernelInfo().getGraphicsAllocation()->getGpuAddressToPatch() + + kernel->kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad; + + EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); + EXPECT_EQ((uint32_t)(expectedKernelStartOffset >> 32), idd.getKernelStartPointerHigh()); + + auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, + false, *cmdQ.get(), multiDispatchInfo, false, false); + expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); + expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); + EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); + + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenPassInlineDataEnabledWhenLocalIdsUsedThenDoNotExpectCrossThreadDataInWalkerEmitLocalFieldSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using INLINE_DATA = typename FamilyType::INLINE_DATA; + + DebugManager.flags.EnablePassInlineData.set(true); + DebugManager.flags.EnableHwGenerationLocalIds.set(0); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + auto &commandStream = cmdQ->getCS(1024); + auto usedBeforeCS = commandStream.getUsed(); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 1; + kd.kernelAttributes.localId[1] = 0; + kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 1; + + kernel->mockKernel->setCrossThreadData(crossThreadDataGrf, sizeof(INLINE_DATA)); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + 
kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + size_t lws[] = {16, 1, 1}; + size_t globalOffsets[] = {0, 0, 0}; + + MultiDispatchInfo multiDispatchInfo(kernel->mockKernel); + DispatchInfoBuilder builder(*device); + builder.setDispatchGeometry(1, gws, lws, globalOffsets); + builder.setKernel(kernel->mockKernel); + builder.bake(multiDispatchInfo); + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + auto usedAfterCS = commandStream.getUsed(); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(1u, walker->getEmitInlineParameter()); + + EXPECT_EQ(0u, walker->getGenerateLocalId()); + constexpr uint32_t expectedEmit = 0u; + EXPECT_EQ(expectedEmit, walker->getEmitLocalId()); + EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataGrf, sizeof(INLINE_DATA))); + + uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize(); + //only X is present + auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf, 1); + sizePerThreadData = std::max(sizePerThreadData, sizeGrf); + size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData; + + uint32_t expectedIndirectDataLength = alignUp(static_cast(perThreadTotalDataSize), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + INTERFACE_DESCRIPTOR_DATA &idd = walker->getInterfaceDescriptor(); + uint64_t expectedKernelStartOffset = kernel->mockKernel->getKernelInfo().getGraphicsAllocation()->getGpuAddressToPatch(); + + EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); + 
EXPECT_EQ((uint32_t)(expectedKernelStartOffset >> 32), idd.getKernelStartPointerHigh()); + + auto expectedSizeCS = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false, + false, *cmdQ.get(), multiDispatchInfo, false, false); + expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END); + expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize); + EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS); + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenExecutionMaskWithoutReminderWhenProgrammingWalkerThenSetValidNumberOfBitsInMask) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + std::array testedSimd = {{1, 8, 16, 32}}; + + for (auto simd : testedSimd) { + kernel->kernelInfo.kernelDescriptor.kernelAttributes.simdSize = simd; + + auto kernelSimd = kernel->mockKernel->getKernelInfo().getMaxSimdSize(); + EXPECT_EQ(simd, kernelSimd); + + size_t gws[] = {kernelSimd, 1, 1}; + size_t lws[] = {kernelSimd, 1, 1}; + + auto streamOffset = cmdQ->getCS(0).getUsed(); + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), streamOffset); + hwParser.findHardwareCommands(); + + auto walker = genCmdCast(*hwParser.itorWalker); + if (simd == 1) { + EXPECT_EQ(maxNBitValue(32), walker->getExecutionMask()); + } else { + EXPECT_EQ(maxNBitValue(simd), walker->getExecutionMask()); + } + } +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenPassInlineDataEnabledWhenLocalIdsUsedAndCrossThreadIsTwoGrfsThenExpectFirstCrossThreadDataInWalkerSecondInPayloadWithPerThread) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using INLINE_DATA = typename FamilyType::INLINE_DATA; + + 
DebugManager.flags.EnablePassInlineData.set(true); + DebugManager.flags.EnableHwGenerationLocalIds.set(false); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + IndirectHeap &ih = cmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 2048); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 1; + kd.kernelAttributes.localId[1] = 0; + kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 1; + + kernel->mockKernel->setCrossThreadData(crossThreadDataTwoGrf, sizeof(INLINE_DATA) * 2); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + size_t lws[] = {16, 1, 1}; + size_t globalOffsets[] = {0, 0, 0}; + + MultiDispatchInfo multiDispatchInfo(kernel->mockKernel); + DispatchInfoBuilder builder(*device); + builder.setDispatchGeometry(1, gws, lws, globalOffsets); + builder.setKernel(kernel->mockKernel); + builder.bake(multiDispatchInfo); + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(1u, walker->getEmitInlineParameter()); + + EXPECT_EQ(0u, walker->getGenerateLocalId()); + constexpr uint32_t expectedEmit = 0u; + EXPECT_EQ(expectedEmit, walker->getEmitLocalId()); + EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataTwoGrf, sizeof(INLINE_DATA))); + void *payloadData = ih.getCpuBase(); + EXPECT_EQ(0, memcmp(payloadData, &crossThreadDataTwoGrf[sizeof(INLINE_DATA) / sizeof(uint32_t)], sizeof(INLINE_DATA))); + + uint32_t simd = 
kernel->mockKernel->getKernelInfo().getMaxSimdSize(); + //only X is present + uint32_t localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, sizeGrf, 1); + localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf); + auto sizePerThreadData = getThreadsPerWG(simd, lws[0]) * localIdSizePerThread; + + auto crossThreadDataSize = kernel->mockKernel->getCrossThreadDataSize(); + crossThreadDataSize -= std::min(static_cast(sizeof(INLINE_DATA)), crossThreadDataSize); + + //second GRF in indirect + uint32_t expectedIndirectDataLength = static_cast(sizePerThreadData + crossThreadDataSize); + expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenPassInlineDataEnabledWhenNoLocalIdsUsedThenExpectCrossThreadDataInWalkerAndNoEmitLocalFieldSet) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using INLINE_DATA = typename FamilyType::INLINE_DATA; + + DebugManager.flags.EnablePassInlineData.set(true); + DebugManager.flags.EnableHwGenerationLocalIds.set(false); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 0; + kd.kernelAttributes.localId[1] = 0; + kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 0; + + kernel->mockKernel->setCrossThreadData(crossThreadDataGrf, sizeof(INLINE_DATA)); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + 
size_t lws[] = {16, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(1u, walker->getEmitInlineParameter()); + + EXPECT_EQ(0u, walker->getGenerateLocalId()); + EXPECT_EQ(0u, walker->getEmitLocalId()); + + EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataGrf, sizeof(INLINE_DATA))); + + uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize(); + //only X is present + auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, 1); + sizePerThreadData = std::max(sizePerThreadData, sizeGrf); + size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData; + uint32_t expectedIndirectDataLength = static_cast(perThreadTotalDataSize); + expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenPassInlineDataEnabledWhenNoLocalIdsUsedAndCrossThreadIsTwoGrfsThenExpectFirstCrossThreadDataInWalkerSecondInPayload) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using INLINE_DATA = typename FamilyType::INLINE_DATA; + + DebugManager.flags.EnablePassInlineData.set(true); + DebugManager.flags.EnableHwGenerationLocalIds.set(false); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + IndirectHeap &ih = cmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 2048); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 0; + kd.kernelAttributes.localId[1] = 0; + 
kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 0; + + kernel->mockKernel->setCrossThreadData(crossThreadDataTwoGrf, sizeof(INLINE_DATA) * 2); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + size_t lws[] = {16, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(1u, walker->getEmitInlineParameter()); + + EXPECT_EQ(0u, walker->getGenerateLocalId()); + EXPECT_EQ(0u, walker->getEmitLocalId()); + + EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataTwoGrf, sizeof(INLINE_DATA))); + void *payloadData = ih.getCpuBase(); + EXPECT_EQ(0, memcmp(payloadData, &crossThreadDataTwoGrf[sizeof(INLINE_DATA) / sizeof(uint32_t)], sizeof(INLINE_DATA))); + + uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize(); + //only X is present + auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, 1); + sizePerThreadData = std::max(sizePerThreadData, sizeGrf); + size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData; + + //second GRF in indirect + uint32_t expectedIndirectDataLength = static_cast(perThreadTotalDataSize + sizeof(INLINE_DATA)); + expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, 
givenAllChannelsActiveWithWorkDimOneDimensionThenHwGenerationIsEnabledWithOverwrittenWalkOrder) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + + DebugManager.flags.EnableHwGenerationLocalIds.set(true); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 1; + kd.kernelAttributes.localId[1] = 1; + kd.kernelAttributes.localId[2] = 1; + kd.kernelAttributes.numLocalIdChannels = 3; + + kernel->mockKernel->setCrossThreadData(crossThreadDataTwoGrf, sizeGrf * 2); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {4000, 1, 1}; + size_t lws[] = {40, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + + EXPECT_EQ(1u, walker->getGenerateLocalId()); + EXPECT_EQ(7u, walker->getEmitLocalId()); + EXPECT_EQ(4u, walker->getWalkOrder()); + + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenPassInlineDataAndHwLocalIdsGenerationEnabledWhenLocalIdsUsedThenExpectCrossThreadDataInWalkerAndEmitFields) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using INLINE_DATA = typename FamilyType::INLINE_DATA; + + DebugManager.flags.EnablePassInlineData.set(true); + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + auto cmdQ = 
std::make_unique>(context.get(), device.get(), nullptr); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.entryPoints.skipPerThreadDataLoad = 128; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 1; + kd.kernelAttributes.localId[1] = 0; + kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 1; + + kernel->mockKernel->setCrossThreadData(crossThreadDataGrf, sizeof(INLINE_DATA)); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + size_t lws[] = {16, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(1u, walker->getEmitInlineParameter()); + + EXPECT_EQ(1u, walker->getGenerateLocalId()); + constexpr uint32_t expectedEmit = (1 << 0); + EXPECT_EQ(expectedEmit, walker->getEmitLocalId()); + + EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataGrf, sizeof(INLINE_DATA))); + + constexpr uint32_t expectedIndirectDataLength = 0; + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + INTERFACE_DESCRIPTOR_DATA &idd = walker->getInterfaceDescriptor(); + uint64_t expectedKernelStartOffset = kernel->mockKernel->getKernelInfo().getGraphicsAllocation()->getGpuAddressToPatch() + + kernel->kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad; + + EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); + EXPECT_EQ((uint32_t)(expectedKernelStartOffset >> 32), idd.getKernelStartPointerHigh()); + + 
memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenPassInlineDataAndHwLocalIdsGenerationEnabledWhenLocalIdsNotUsedThenExpectCrossThreadDataInWalkerAndNoHwLocalIdGeneration) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using INLINE_DATA = typename FamilyType::INLINE_DATA; + + DebugManager.flags.EnablePassInlineData.set(true); + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + auto &kd = kernel->kernelInfo.kernelDescriptor; + kd.entryPoints.skipPerThreadDataLoad = 128; + kd.kernelAttributes.flags.passInlineData = true; + kd.kernelAttributes.localId[0] = 0; + kd.kernelAttributes.localId[1] = 0; + kd.kernelAttributes.localId[2] = 0; + kd.kernelAttributes.numLocalIdChannels = 0; + + kernel->mockKernel->setCrossThreadData(crossThreadDataGrf, sizeof(INLINE_DATA)); + + auto memoryManager = device->getUltCommandStreamReceiver().getMemoryManager(); + kernel->kernelInfo.kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); + + size_t gws[] = {16, 1, 1}; + size_t lws[] = {16, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(cmdQ->getCS(0), 0); + hwParser.findHardwareCommands(); + EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end()); + + auto walker = genCmdCast(*hwParser.itorWalker); + EXPECT_EQ(1u, walker->getEmitInlineParameter()); + + EXPECT_EQ(0u, walker->getGenerateLocalId()); + constexpr uint32_t expectedEmit = 0; + EXPECT_EQ(expectedEmit, walker->getEmitLocalId()); + + EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataGrf, sizeof(INLINE_DATA))); + + constexpr uint32_t 
expectedIndirectDataLength = 0; + EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength()); + + INTERFACE_DESCRIPTOR_DATA &idd = walker->getInterfaceDescriptor(); + uint64_t expectedKernelStartOffset = kernel->mockKernel->getKernelInfo().getGraphicsAllocation()->getGpuAddressToPatch(); + + EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer()); + EXPECT_EQ((uint32_t)(expectedKernelStartOffset >> 32), idd.getKernelStartPointerHigh()); + + memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenWalkerPartitionIsOnThenSizeIsProperlyEstimated) { + DebugManager.flags.EnableWalkerPartition.set(1u); + UltClDeviceFactory deviceFactory{1, 2}; + MockClDevice *device = deviceFactory.rootDevices[0]; + MockContext context{device}; + + auto synchronizeBeforeExecution = false; + auto staticPartitioning = false; + auto cmdQ = std::make_unique>(&context, device, nullptr); + auto &csr = cmdQ->getUltCommandStreamReceiver(); + + size_t numPipeControls = MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo()) ? 
2 : 1; + + auto baseSize = sizeof(typename FamilyType::COMPUTE_WALKER) + + (sizeof(typename FamilyType::PIPE_CONTROL) * numPipeControls) + + HardwareCommandsHelper::getSizeRequiredCS() + + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize); + + DispatchInfo dispatchInfo{}; + dispatchInfo.setNumberOfWorkgroups({32, 1, 1}); + + synchronizeBeforeExecution = false; + DebugManager.flags.SynchronizeWalkerInWparidMode.set(0); + staticPartitioning = false; + csr.staticWorkPartitioningEnabled = false; + auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false); + auto returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); + + synchronizeBeforeExecution = false; + DebugManager.flags.SynchronizeWalkerInWparidMode.set(0); + staticPartitioning = true; + csr.staticWorkPartitioningEnabled = true; + partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false); + returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); + + synchronizeBeforeExecution = true; + DebugManager.flags.SynchronizeWalkerInWparidMode.set(1); + staticPartitioning = false; + csr.staticWorkPartitioningEnabled = false; + partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false); + returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); + + synchronizeBeforeExecution = true; + 
DebugManager.flags.SynchronizeWalkerInWparidMode.set(1); + staticPartitioning = true; + csr.staticWorkPartitioningEnabled = true; + partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false); + returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenWalkerPartitionIsDisabledThenSizeIsProperlyEstimated) { + DebugManager.flags.EnableWalkerPartition.set(0u); + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + size_t numPipeControls = MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo()) ? 2 : 1; + + DispatchInfo dispatchInfo{}; + dispatchInfo.setNumberOfWorkgroups({32, 1, 1}); + + auto baseSize = sizeof(typename FamilyType::COMPUTE_WALKER) + + (sizeof(typename FamilyType::PIPE_CONTROL) * numPipeControls) + + HardwareCommandsHelper::getSizeRequiredCS() + + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize); + + auto returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, baseSize); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenQueueIsMultiEngineCapableThenWalkerPartitionsAreEstimated) { + DebugManager.flags.EnableWalkerPartition.set(1u); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + size_t numPipeControls = MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo()) ? 
2 : 1; + + auto baseSize = sizeof(typename FamilyType::COMPUTE_WALKER) + + (sizeof(typename FamilyType::PIPE_CONTROL) * numPipeControls) + + HardwareCommandsHelper::getSizeRequiredCS() + + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize); + + auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, 16u, false, false, false, false); + + DispatchInfo dispatchInfo{}; + dispatchInfo.setNumberOfWorkgroups({32, 1, 1}); + + auto returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. Enqueues gws {2,1,1} / lws {1,1,1} with EnableWalkerPartition set to 1 and expects the parsed walker to report PARTITION_TYPE_X. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenProgramWalkerIsCalledThenWalkerPartitionLogicIsExecuted) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + DebugManager.flags.EnableWalkerPartition.set(1u); + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {2, 1, 1}; + size_t lws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. Enqueues gws {2,1,1} / lws {1,1,1} with both EnableWalkerPartition and SynchronizeWalkerInWparidMode set to 1; PARTITION_TYPE_X is still the expected partition type. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenProgramWalkerIsCalledAndForceSynchronizeWalkerInWpariModeThenWalkerPartitionLogicIsExecuted) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + DebugManager.flags.EnableWalkerPartition.set(1u); + DebugManager.flags.SynchronizeWalkerInWparidMode.set(1); + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {2, 1, 1}; + size_t lws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; 
+ hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. A local MockKernel subclass overrides isSingleSubdevicePreferred() to return true; enqueueing it is expected to produce a walker with PARTITION_TYPE_DISABLED. This test does not set EnableWalkerPartition itself. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenPartitioningIsNotUsed) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + struct SingleSubdeviceKernel : public MockKernel { + using MockKernel::MockKernel; + bool isSingleSubdevicePreferred() const override { return true; } + }; + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {2, 1, 1}; + size_t lws[] = {1, 1, 1}; + SingleSubdeviceKernel subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device); + cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType()); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. With EnableWalkerPartition set to 0 the enqueued walker is expected to report PARTITION_TYPE_DISABLED. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithPartitionLogicDisabledThenWalkerPartitionLogicIsNotExecuted) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + DebugManager.flags.EnableWalkerPartition.set(0u); + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {2, 1, 1}; + size_t lws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType()); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. Enqueues gws {128,1,1} / lws {1,1,1} with EnableWalkerPartition set to 1 and expects PARTITION_TYPE_X with a partition size of 64. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, 
XeHPPlusDispatchWalkerBasicTest, whenQueueIsCreatedWithMultiEngineSupportAndEnqueueIsDoneThenWalkerIsPartitioned) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + DebugManager.flags.EnableWalkerPartition.set(1u); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {128, 1, 1}; + size_t lws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); + EXPECT_EQ(64u, computeWalker->getPartitionSize()); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. ExperimentalSetWalkerPartitionCount and ExperimentalSetWalkerPartitionType are both set to 2; the walker is expected to report PARTITION_TYPE_Y with partition size 1, and getPacketsUsed() of the first timestamp packet node is expected to equal the partition-count flag value. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithDebugRegistryOverridesThenWalkerContainsProperParameters) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + DebugManager.flags.EnableWalkerPartition.set(1u); + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(2u); + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(2u); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + auto timestampPacket = cmdQ->timestampPacketContainer->peekNodes().at(0); + auto expectedPartitionCount = timestampPacket->getPacketsUsed(); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, computeWalker->getPartitionType()); + EXPECT_EQ(1u, computeWalker->getPartitionSize()); + EXPECT_EQ(expectedPartitionCount, static_cast(DebugManager.flags.ExperimentalSetWalkerPartitionCount.get())); +} + /* With ExperimentalSetWalkerPartitionCount set to 1 (partition type override still 2) the walker is expected to be unpartitioned: PARTITION_TYPE_DISABLED, partition size 0, workload partition disabled. This test has no local-memory skip guard. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, 
XeHPPlusDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithDebugRegistryOverridesToPartitionCountOneThenProgramProperParameters) { + DebugManager.flags.EnableWalkerPartition.set(1u); + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(1u); + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(2u); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType()); + EXPECT_EQ(0u, computeWalker->getPartitionSize()); + EXPECT_FALSE(computeWalker->getWorkloadPartitionEnable()); +} + /* OSInterface::osEnableLocalMemory is overridden to false through a VariableBackup for this test. Even with partition count/type overrides of 2, the walker is expected to be unpartitioned (PARTITION_TYPE_DISABLED, size 0, workload partition disabled). */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenThereIsNoLocalMemorySupportThenDoNotPartition) { + DebugManager.flags.EnableWalkerPartition.set(1u); + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(2u); + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(2u); + VariableBackup backup(&OSInterface::osEnableLocalMemory, false); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType()); + EXPECT_EQ(0u, computeWalker->getPartitionSize()); + EXPECT_FALSE(computeWalker->getWorkloadPartitionEnable()); +} + /* Enqueue blocked on a user event whose status is then set to 0; the walker is read from the CSR's lastFlushedCommandStream. NOTE(review): the test name says "DoNotPartition", yet the assertions expect PARTITION_TYPE_Y, partition size 1 and workload partition enabled - confirm which is intended. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenEnqueueIsBlockedOnUserEventThenDoNotPartition) { + if 
(!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + DebugManager.flags.EnableWalkerPartition.set(1u); + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(2u); + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(2u); + + cl_event userEvent = clCreateUserEvent(context.get(), nullptr); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 1, &userEvent, nullptr); + clSetUserEventStatus(userEvent, 0u); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ->getUltCommandStreamReceiver().lastFlushedCommandStream); + hwParser.findHardwareCommands(&cmdQ->getGpgpuCommandStreamReceiver().getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0)); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, computeWalker->getPartitionType()); + EXPECT_EQ(1u, computeWalker->getPartitionSize()); + EXPECT_TRUE(computeWalker->getWorkloadPartitionEnable()); + + clReleaseEvent(userEvent); +} + /* Calls dispatchProfilingCommandsStart and dispatchProfilingCommandsEnd on the queue's command stream with a tag from a mock allocator and expects the stream's used size to remain 0, i.e. neither call emits commands. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, whenDispatchProfilingCalledThenDoNothing) { + MockCommandQueue cmdQ(context.get(), device.get(), nullptr, false); + + auto &cmdStream = cmdQ.getCS(0); + MockTagAllocator timeStampAllocator(device->getRootDeviceIndex(), device->getMemoryManager(), 10, + MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false, device->getDeviceBitfield()); + + auto hwTimeStamp1 = timeStampAllocator.getTag(); + + GpgpuWalkerHelper::dispatchProfilingCommandsStart(*hwTimeStamp1, &cmdStream, device->getHardwareInfo()); + + GpgpuWalkerHelper::dispatchProfilingCommandsEnd(*hwTimeStamp1, &cmdStream, device->getHardwareInfo()); + + EXPECT_EQ(0u, cmdStream.getUsed()); +} + /* Skipped when OSInterface::osEnableLocalMemory is false. Enqueues gws {128,1,1} / lws {8,1,1} with EnableWalkerPartition set to 1; expects PARTITION_TYPE_X, partition size 8, and an empty list in storeDataImmList (presumably the MI_STORE_DATA_IMM commands, per the alias declared in the test - the template argument is not visible here). */ +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusDispatchWalkerBasicTest, givenOpenClWhenEnqueuePartitionWalkerThenExpectNoNativeCrossTileSyncCleanup) { 
+ using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + DebugManager.flags.EnableWalkerPartition.set(1u); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {128, 1, 1}; + size_t lws[] = {8, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); + EXPECT_EQ(8u, computeWalker->getPartitionSize()); + + GenCmdList storeDataImmList = hwParser.getCommandsList(); + EXPECT_EQ(0u, storeDataImmList.size()); +} + /* Alias so the next test runs on the same fixture under a separate test-suite name. */ +using NonDefaultPlatformGpuWalkerTest = XeHPPlusDispatchWalkerBasicTest; + /* Builds a separate execution environment and initialises GMM on its root device environment 0, enqueues a kernel, then clears platformsImpl (platform() is expected to be nullptr afterwards) and calls setupTimestampPacket with that root device environment. Nothing is asserted after the call, so the test only checks that the call completes. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, NonDefaultPlatformGpuWalkerTest, givenNonDefaultPlatformWhenSetupTimestampPacketThenGmmHelperIsTakenFromNonDefaultPlatform) { + auto executionEnvironment = std::make_unique(); + auto rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get(); + rootDeviceEnvironment->initGmm(); + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {1, 1, 1}; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + auto &cmdStream = cmdQ->getCS(0); + TagNode> timestamp; + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + + platformsImpl->clear(); + EXPECT_EQ(platform(), nullptr); + /* NOTE(review): the third argument below looks like an extraction-garbled address-of expression on the local `timestamp` declared above - confirm against the original patch. */ GpgpuWalkerHelper::setupTimestampPacket(&cmdStream, computeWalker, static_cast(×tamp), *rootDeviceEnvironment); +} + /* workDim 1, SIMD 8, lws {16,1,1}, zero-initialised walkOrder, no debug flag changed in this test: runtime local-id generation is expected not to be required. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenDefaultLocalIdsGenerationWhenPassingFittingParametersThenReturnFalse) { + uint32_t workDim = 1; + uint32_t simd = 8; + 
size_t lws[3] = {16, 1, 1}; + std::array walkOrder = {}; + uint32_t requiredWalkOrder = 0u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); +} + /* EnableHwGenerationLocalIds set to 1, workDim 1, SIMD 8: runtime generation is expected not to be required for lws[0] == 16 (fourth argument true) nor for lws[0] == 15 (fourth argument false); requiredWalkOrder starts at 4 and is expected to read 0 after each call. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenEnabledLocalIdsGenerationWhenPassingFittingOneDimParametersThenReturnFalse) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 1; + uint32_t simd = 8; + size_t lws[3] = {16, 1, 1}; + std::array walkOrder = {{0, 1, 2}}; + uint32_t requiredWalkOrder = 4u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); + + lws[0] = 15; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); +} + /* workDim 2 with walkOrder {1,0,2} and the fourth argument true: runtime generation is expected not to be required for lws {16,16,1} nor for {15,16,1}, and requiredWalkOrder is expected to be 2 in both cases. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenEnabledLocalIdsGenerationWhenPassingFittingTwoDimParametersThenReturnFalse) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 2; + uint32_t simd = 8; + size_t lws[3] = {16, 16, 1}; + std::array walkOrder = {{1, 0, 2}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired(workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(2u, requiredWalkOrder); + + lws[0] = 15; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(2u, requiredWalkOrder); +} + /* workDim 3, SIMD 8, lws {200,1,1}, walkOrder {2,1,0} with the fourth argument true: runtime generation is expected not to be required and requiredWalkOrder is expected to be 5. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenWalkOrderThatNeedsToBeFollowedWithCompatibleDimSizesArePassedThenRuntimeGenerationIsNotRequired) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 3; + uint32_t simd 
= 8; + size_t lws[3] = {200, 1, 1}; + std::array walkOrder = {{2, 1, 0}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(5u, requiredWalkOrder); +} + /* With workDim 3, runtime generation is expected to be required for lws {1025,1,1}, {1,1,1025} and {32,32,4}; with workDim lowered to 2 the same {32,32,4} array is expected not to require it. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenLocalWorkgroupSizeGreaterThen1024ThenRuntimeMustGenerateLocalIds) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 3; + uint32_t simd = 8; + std::array lws = {1025, 1, 1}; + + std::array walkOrder = {{0, 1, 2}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + + lws = {1, 1, 1025}; + + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + + lws = {32, 32, 4}; + + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + + workDim = 2; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); +} + /* Table of lws / workDim combinations, all evaluated with the fourth argument false and walkOrder {0,2,1}. Each case checks the returned flag and, where the flag is false, the requiredWalkOrder value written by the call. Note the test name ends in "IsReuqired" (sic) although only the lws {17,2,17} / workDim 3 case expects true. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenWalkOrderThatDoesntNeedToBeFollowedWhenIncompatibleDimSizesArePassedThenRuntimeGenerationIsReuqired) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 3; + uint32_t simd = 8; + std::array lws = {200, 1, 1}; + + std::array walkOrder = {{0, 2, 1}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(4u, requiredWalkOrder); + + lws = {16, 17, 2}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, 
lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(1u, requiredWalkOrder); + + lws = {16, 2, 17}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); + + /* The only case in this table that expects runtime generation to be required. */ lws = {17, 2, 17}; + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + + lws = {3, 4, 32}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(4u, requiredWalkOrder); + + /* Remaining cases repeat three-element lws arrays with workDim lowered to 2. */ workDim = 2; + lws = {17, 2, 17}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(2u, requiredWalkOrder); + + lws = {2, 17, 17}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); + + lws = {2, 4, 17}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); + + /* ...and then to 1; both workDim 1 cases expect requiredWalkOrder 0. */ workDim = 1; + lws = {17, 2, 17}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); + + workDim = 1; + lws = {2, 17, 17}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); +} + /* With EnableHwGenerationLocalIds set to 0, runtime generation is expected to be required even for workDim 3, SIMD 8, lws {16,16,4}. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenDisabledLocalIdsGenerationWhenPassingFittingThreeDimParametersThenReturnTrue) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(0); + + uint32_t workDim = 3; + uint32_t simd = 8; + size_t 
lws[3] = {16, 16, 4}; + + std::array walkOrder = {{1, 0, 2}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); +} + /* workDim 3, SIMD 8, lws {16,16,2}, fourth argument true: every walkOrder tried is expected to return false and to write a specific requiredWalkOrder: {2,1,0}->5, {2,0,1}->3, {1,2,0}->4, {1,0,2}->2, {0,2,1}->1, {0,1,2}->0. The non-permutation {2,2,0} is expected to yield 6. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenEnabledLocalIdsGenerationWhenPassingFittingThreeDimParametersThenReturnFalseAndProperWalkOrder) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 3; + uint32_t simd = 8; + size_t lws[3] = {16, 16, 2}; + std::array walkOrder = {{2, 1, 0}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(5u, requiredWalkOrder); + + walkOrder = {2, 0, 1}; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(3u, requiredWalkOrder); + + walkOrder = {1, 2, 0}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(4u, requiredWalkOrder); + + walkOrder = {1, 0, 2}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(2u, requiredWalkOrder); + + walkOrder = {0, 2, 1}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(1u, requiredWalkOrder); + + walkOrder = {0, 1, 2}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + EXPECT_EQ(0u, requiredWalkOrder); + + //incorrect walkOrder returns 6 + walkOrder = {2, 2, 0}; + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + 
EXPECT_EQ(6u, requiredWalkOrder); +} + /* workDim 2, SIMD 8, lws {15,15,1}, fourth argument true: runtime generation is expected to be required. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenEnabledLocalIdsGenerationWhenPassingInvalidLwsTwoDimParametersThenReturnTrue) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 2; + uint32_t simd = 8; + size_t lws[3] = {15, 15, 1}; + + std::array walkOrder = {{0, 1, 2}}; + uint32_t requiredWalkOrder = 4u; + + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); +} + /* workDim 3, SIMD 8, fourth argument true: runtime generation is expected to be required for lws {16,15,15} and again after lws[0] is changed to 15. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenEnabledLocalIdsGenerationWhenPassingInvalidLwsThreeDimParametersThenReturnTrue) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 3; + uint32_t simd = 8; + size_t lws[3] = {16, 15, 15}; + std::array walkOrder = {{0, 1, 2}}; + uint32_t requiredWalkOrder = 4u; + + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); + + lws[0] = 15; + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws, walkOrder, true, requiredWalkOrder, simd)); +} + /* The same lws {200,1,1} / walkOrder {0,2,1} input is evaluated twice: at SIMD 8 runtime generation is expected not to be required, at SIMD 1 it is expected to be required. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenSimdSize1TWhenCheckToGeneratHwIdsThenReturnedFalse) { + DebugManagerStateRestore restore; + DebugManager.flags.EnableHwGenerationLocalIds.set(1); + + uint32_t workDim = 3; + uint32_t simd = 8; + std::array lws = {200, 1, 1}; + + std::array walkOrder = {{0, 2, 1}}; + uint32_t requiredWalkOrder = 77u; + + EXPECT_FALSE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); + simd = 1; + EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); +} diff --git a/opencl/test/unit_test/command_queue/enqueue_media_kernel_xehp_plus.cpp 
b/opencl/test/unit_test/command_queue/enqueue_media_kernel_xehp_plus.cpp new file mode 100644 index 0000000000..928a4ece76 --- /dev/null +++ b/opencl/test/unit_test/command_queue/enqueue_media_kernel_xehp_plus.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/pipeline_select_helper.h" +#include "shared/test/common/cmd_parse/gen_cmd_parse.h" + +#include "opencl/test/unit_test/fixtures/media_kernel_fixture.h" +#include "opencl/test/unit_test/libult/ult_command_stream_receiver.h" +#include "test.h" + +using namespace NEO; +typedef MediaKernelFixture MediaKernelTest; + /* Enqueues pVmeKernel blocked on a user event, completes the event, then parses the queue. The PIPELINE_SELECT found is expected to select GPGPU with the pipeline-select, media-sampler-DOP-clock-gate and systolic-mode mask bits all set, and with both the DOP clock gate enable and systolic mode enable fields false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueBlockedVmeKernelFirstTimeThenProgramPipelineSelectionAndMediaSampler) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {1, 1, 1}; + + UserEvent userEvent(context); + cl_event blockedEvent = &userEvent; + + auto retVal = pCmdQ->enqueueKernel( + pVmeKernel, + workDim, + globalWorkOffset, + globalWorkSize, + nullptr, + 1, + &blockedEvent, + nullptr); + ASSERT_EQ(CL_SUCCESS, retVal); + + userEvent.setStatus(CL_COMPLETE); + + parseCommands(*pCmdQ); + ASSERT_NE(cmdPipelineSelect, nullptr); + auto *pCmd = genCmdCast(cmdPipelineSelect); + + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_FALSE(pCmd->getMediaSamplerDopClockGateEnable()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); + pCmdQ->releaseVirtualEvent(); +} + /* Blocked enqueue of pKernel (the non-VME kernel): same mask and GPGPU selection are expected, but the DOP clock gate enable field is expected to be true; systolic mode stays false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, 
givenXeHPPlusCsrWhenEnqueueBlockedNonVmeKernelFirstTimeThenProgramPipelineSelectionAndMediaSampler) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {1, 1, 1}; + + UserEvent userEvent(context); + cl_event blockedEvent = &userEvent; + + auto retVal = pCmdQ->enqueueKernel( + pKernel, + workDim, + globalWorkOffset, + globalWorkSize, + nullptr, + 1, + &blockedEvent, + nullptr); + ASSERT_EQ(CL_SUCCESS, retVal); + + userEvent.setStatus(CL_COMPLETE); + + parseCommands(*pCmdQ); + ASSERT_NE(cmdPipelineSelect, nullptr); + auto *pCmd = genCmdCast(cmdPipelineSelect); + + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_TRUE(pCmd->getMediaSamplerDopClockGateEnable()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); + pCmdQ->releaseVirtualEvent(); +} + /* Non-blocked enqueue of pVmeKernel: a walker must be present in the parsed list, getCommandsList() is expected to hold exactly one entry, and the retrieved command is expected to select GPGPU with the full mask, DOP clock gate enable false and systolic mode false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueVmeKernelFirstTimeThenProgramPipelineSelectionAndMediaSampler) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + auto numCommands = getCommandsList().size(); + EXPECT_EQ(1u, numCommands); + + auto pCmd = getCommand(); + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, 
pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_FALSE(pCmd->getMediaSamplerDopClockGateEnable()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); +} + /* Non-blocked enqueue of pKernel: exactly one entry is expected from getCommandsList(), selecting GPGPU with the full mask, DOP clock gate enable true and systolic mode false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueNonVmeKernelFirstTimeThenProgramPipelineSelectionAndMediaSampler) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + auto numCommands = getCommandsList().size(); + EXPECT_EQ(1u, numCommands); + + auto pCmd = getCommand(); + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_TRUE(pCmd->getMediaSamplerDopClockGateEnable()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); +} + /* Expects getCommandsList() to hold one entry after enqueueing pVmeKernel. NOTE(review): the name says "Twice" but only a single enqueue is visible in the body, so the "programmed once across two enqueues" property is not exercised - confirm whether a second enqueue is missing. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueVmeKernelTwiceThenProgramPipelineSelectOnce) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + auto numCommands = getCommandsList().size(); + EXPECT_EQ(1u, numCommands); +} + /* NOTE(review): the name says "NonVme...Twice", yet the body enqueues pVmeKernel (not pKernel) and does so only once - it reads as a copy of the preceding test; confirm the intended kernel and enqueue count. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueNonVmeKernelTwiceThenProgramPipelineSelectOnce) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + 
ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + auto numCommands = getCommandsList().size(); + EXPECT_EQ(1u, numCommands); +} + /* Enqueues pKernel and then pVmeKernel, parsing after each. Two entries are expected from getCommandsList(); the last one is expected to carry the full mask with DOP clock gate enable false and systolic mode false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueVmeKernelAfterNonVmeKernelThenProgramPipelineSelectionAndMediaSamplerTwice) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + auto commands = getCommandsList(); + EXPECT_EQ(2u, commands.size()); + + auto pCmd = static_cast(commands.back()); + + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_FALSE(pCmd->getMediaSamplerDopClockGateEnable()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); +} + /* Reverse order: pVmeKernel first, then pKernel. Two entries are expected; the last one is expected to have DOP clock gate enable true and systolic mode false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueNonVmeKernelAfterVmeKernelThenProgramProgramPipelineSelectionAndMediaSamplerTwice) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), 
cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + + auto commands = getCommandsList(); + EXPECT_EQ(2u, commands.size()); + + auto pCmd = static_cast(commands.back()); + + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_TRUE(pCmd->getMediaSamplerDopClockGateEnable()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); +} + /* csr->lastVmeSubslicesConfig is preset to true and is expected to still be true after pVmeKernel has been enqueued and a walker found. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueVmeKernelThenVmeSubslicesConfigDoesntChangeToFalse) { + auto csr = static_cast *>(&pDevice->getGpgpuCommandStreamReceiver()); + csr->lastVmeSubslicesConfig = true; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + EXPECT_TRUE(csr->lastVmeSubslicesConfig); +} + /* Mirror case: csr->lastVmeSubslicesConfig is preset to false and is expected to still be false after pVmeKernel has been enqueued. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusCsrWhenEnqueueVmeKernelThenVmeSubslicesConfigDoesntChangeToTrue) { + auto csr = static_cast *>(&pDevice->getGpgpuCommandStreamReceiver()); + csr->lastVmeSubslicesConfig = false; + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + pVmeKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + + parseCommands(*pCmdQ); + + itorWalker1 = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorWalker1); + EXPECT_FALSE(csr->lastVmeSubslicesConfig); +} + /* getCmdSizeForMediaSampler is expected to return 0 for both argument values (false and true), regardless of whether lastVmeSubslicesConfig is false or true. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, WhenGettingCmdSizeForVmeThenZeroIsReturned) { + auto csr = static_cast *>(&pDevice->getGpgpuCommandStreamReceiver()); + + csr->lastVmeSubslicesConfig = false; + EXPECT_EQ(0u, csr->getCmdSizeForMediaSampler(false)); + EXPECT_EQ(0u, csr->getCmdSizeForMediaSampler(true)); + + csr->lastVmeSubslicesConfig = true; + EXPECT_EQ(0u, csr->getCmdSizeForMediaSampler(false)); + EXPECT_EQ(0u, 
csr->getCmdSizeForMediaSampler(true)); +} + /* A mock kernel with setSpecialPipelineSelectMode(true) is enqueued: getCommandCount() is expected to be 1 and the retrieved command is expected to select GPGPU with the full mask and systolic mode enabled. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusWhenEnqueueSystolicKernelThenPipelineSelectEnablesSystolicMode) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + MockKernelWithInternals mockKernel(*pClDevice, context); + mockKernel.mockKernel->setSpecialPipelineSelectMode(true); + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + mockKernel.mockKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_TRUE(mockKernel.mockKernel->requiresSpecialPipelineSelectMode()); + + parseCommands(*pCmdQ); + + auto numCommands = getCommandCount(); + EXPECT_EQ(1u, numCommands); + + auto pCmd = getCommand(); + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_TRUE(pCmd->getSystolicModeEnable()); +} + /* Same flow with setSpecialPipelineSelectMode(false): one command is expected, with the systolic mode enable field false. */ +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusWhenEnqueueNonSystolicKernelThenPipelineSelectDisablesSystolicMode) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + MockKernelWithInternals mockKernel(*pClDevice, context); + mockKernel.mockKernel->setSpecialPipelineSelectMode(false); + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + mockKernel.mockKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_FALSE(mockKernel.mockKernel->requiresSpecialPipelineSelectMode()); + + parseCommands(*pCmdQ); + + auto numCommands = getCommandCount(); + EXPECT_EQ(1u, numCommands); + + auto pCmd = getCommand(); + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, 
pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusWhenEnqueueTwoSystolicKernelsThenPipelineSelectEnablesSystolicModeOnce) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + MockKernelWithInternals mockKernel(*pClDevice, context); + mockKernel.mockKernel->setSpecialPipelineSelectMode(true); + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + mockKernel.mockKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_TRUE(mockKernel.mockKernel->requiresSpecialPipelineSelectMode()); + + MockKernelWithInternals mockKernel2(*pClDevice, context); + mockKernel2.mockKernel->setSpecialPipelineSelectMode(true); + retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + mockKernel2.mockKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_TRUE(mockKernel2.mockKernel->requiresSpecialPipelineSelectMode()); + + parseCommands(*pCmdQ); + + auto numCommands = getCommandCount(); + EXPECT_EQ(1u, numCommands); + + auto pCmd = getCommand(); + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_TRUE(pCmd->getSystolicModeEnable()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, MediaKernelTest, givenXeHPPlusWhenEnqueueTwoKernelsThenPipelineSelectEnablesSystolicModeWhenNeeded) { + typedef typename FamilyType::PIPELINE_SELECT PIPELINE_SELECT; + + MockKernelWithInternals mockKernel(*pClDevice, context); + mockKernel.mockKernel->setSpecialPipelineSelectMode(false); + auto retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + mockKernel.mockKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + 
EXPECT_FALSE(mockKernel.mockKernel->requiresSpecialPipelineSelectMode()); + + MockKernelWithInternals mockKernel2(*pClDevice, context); + mockKernel2.mockKernel->setSpecialPipelineSelectMode(true); + retVal = EnqueueKernelHelper<>::enqueueKernel( + pCmdQ, + mockKernel2.mockKernel); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_TRUE(mockKernel2.mockKernel->requiresSpecialPipelineSelectMode()); + + parseCommands(*pCmdQ); + + auto numCommands = getCommandCount(); + EXPECT_EQ(2u, numCommands); + + auto expectedMask = pipelineSelectEnablePipelineSelectMaskBits | pipelineSelectMediaSamplerDopClockGateMaskBits | pipelineSelectSystolicModeEnableMaskBits; + auto expectedPipelineSelection = PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU; + + auto itorCmd = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorCmd); + auto pCmd = genCmdCast(*itorCmd); + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_FALSE(pCmd->getSystolicModeEnable()); + + itorCmd = find(++itorCmd, cmdList.end()); + ASSERT_NE(cmdList.end(), itorCmd); + pCmd = genCmdCast(*itorCmd); + EXPECT_EQ(expectedMask, pCmd->getMaskBits()); + EXPECT_EQ(expectedPipelineSelection, pCmd->getPipelineSelection()); + EXPECT_TRUE(pCmd->getSystolicModeEnable()); +} diff --git a/opencl/test/unit_test/command_queue/enqueue_resource_barier_tests_xehp_plus.cpp b/opencl/test/unit_test/command_queue/enqueue_resource_barier_tests_xehp_plus.cpp new file mode 100644 index 0000000000..ac8aaeb3a2 --- /dev/null +++ b/opencl/test/unit_test/command_queue/enqueue_resource_barier_tests_xehp_plus.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/os_interface/os_context.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/mocks/mock_graphics_allocation.h" + +#include "opencl/source/command_queue/resource_barrier.h" +#include 
"opencl/source/event/event_builder.h" +#include "opencl/source/event/user_event.h" +#include "opencl/source/mem_obj/buffer.h" +#include "opencl/source/memory_manager/resource_surface.h" +#include "opencl/test/unit_test/fixtures/enqueue_handler_fixture.h" +#include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_timestamp_container.h" +#include "test.h" + +namespace NEO { + +template +class MockCommandQueueWithCacheFlush : public MockCommandQueueHw { + using MockCommandQueueHw::MockCommandQueueHw; + + public: + bool isCacheFlushCommand(uint32_t commandType) const override { + return commandRequireCacheFlush; + } + bool commandRequireCacheFlush = false; +}; + +using EnqueueResourceBarrierTestXeHpCorePlus = EnqueueHandlerTest; + +HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueResourceBarrierTestXeHpCorePlus, GivenCommandStreamWithoutKernelAndTimestampPacketEnabledWhenEnqueuedResourceBarrierWithEventThenTimestampAddedToEvent) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.EnableTimestampPacket.set(1); + pDevice->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + std::unique_ptr> mockCmdQ(new MockCommandQueueWithCacheFlush(context, pClDevice, 0)); + mockCmdQ->commandRequireCacheFlush = true; + + auto retVal = CL_INVALID_VALUE; + size_t bufferSize = MemoryConstants::pageSize; + std::unique_ptr buffer(Buffer::create( + context, + CL_MEM_READ_WRITE, + bufferSize, + nullptr, + retVal)); + auto allocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex()); + std::unique_ptr surface(new ResourceSurface(allocation, CL_RESOURCE_BARRIER_TYPE_RELEASE, CL_MEMORY_SCOPE_DEVICE)); + + MockTimestampPacketContainer timestamp1(*pDevice->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event1(mockCmdQ.get(), 0, 0, 0); + cl_event event2; + event1.addTimestampPacketNodes(timestamp1); + + cl_event waitlist[] = {&event1}; + + cl_resource_barrier_descriptor_intel descriptor{}; + 
descriptor.mem_object = buffer.get(); + descriptor.svm_allocation_pointer = nullptr; + + BarrierCommand barrierCommand(mockCmdQ.get(), &descriptor, 1); + + retVal = mockCmdQ->enqueueResourceBarrier( + &barrierCommand, + 1, + waitlist, + &event2); + + auto eventObj = castToObjectOrAbort(event2); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(eventObj->getTimestampPacketNodes()->peekNodes().size(), 1u); + eventObj->release(); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueResourceBarrierTestXeHpCorePlus, GivenCommandStreamWithoutKernelAndTimestampPacketDisabledWhenEnqueuedResourceBarrierWithEventThenTimestampNotAddedToEvent) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.EnableTimestampPacket.set(0); + static_cast *>(&pDevice->getGpgpuCommandStreamReceiver())->timestampPacketWriteEnabled = false; + std::unique_ptr> mockCmdQ(new MockCommandQueueWithCacheFlush(context, pClDevice, 0)); + mockCmdQ->commandRequireCacheFlush = true; + mockCmdQ->timestampPacketContainer.reset(); + auto retVal = CL_INVALID_VALUE; + size_t bufferSize = MemoryConstants::pageSize; + std::unique_ptr buffer(Buffer::create( + context, + CL_MEM_READ_WRITE, + bufferSize, + nullptr, + retVal)); + auto allocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex()); + std::unique_ptr surface(new ResourceSurface(allocation, CL_RESOURCE_BARRIER_TYPE_RELEASE, CL_MEMORY_SCOPE_DEVICE)); + + Event event1(mockCmdQ.get(), 0, 0, 0); + cl_event event2; + + cl_event waitlist[] = {&event1}; + + cl_resource_barrier_descriptor_intel descriptor{}; + descriptor.mem_object = buffer.get(); + descriptor.svm_allocation_pointer = nullptr; + + BarrierCommand barrierCommand(mockCmdQ.get(), &descriptor, 1); + + retVal = mockCmdQ->enqueueResourceBarrier( + &barrierCommand, + 1, + waitlist, + &event2); + auto eventObj = castToObjectOrAbort(event2); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(nullptr, eventObj->getTimestampPacketNodes()); + eventObj->release(); +} + +} // namespace NEO diff --git 
a/opencl/test/unit_test/command_queue/get_command_queue_info_tests.cpp b/opencl/test/unit_test/command_queue/get_command_queue_info_tests.cpp index 4e21222c06..5f72d4bd2c 100644 --- a/opencl/test/unit_test/command_queue/get_command_queue_info_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_command_queue_info_tests.cpp @@ -5,12 +5,14 @@ * */ +#include "shared/test/common/helpers/debug_manager_state_restore.h" + #include "opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" #include "opencl/test/unit_test/fixtures/context_fixture.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" - -#include "gtest/gtest.h" +#include "opencl/test/unit_test/mocks/mock_platform.h" +#include "test.h" using namespace NEO; @@ -122,7 +124,9 @@ INSTANTIATE_TEST_CASE_P( GetCommandQueueInfoTest, ::testing::ValuesIn(DefaultCommandQueueProperties)); -TEST(GetCommandQueueFamilyInfoTest, givenQueueFamilyNotSelectedWhenGettingFamilyAndQueueIndexThenValuesAreReturned) { +using GetCommandQueueFamilyInfoTests = ::testing::Test; + +TEST_F(GetCommandQueueFamilyInfoTests, givenQueueFamilyNotSelectedWhenGettingFamilyAndQueueIndexThenValuesAreReturned) { MockContext context{}; MockCommandQueue queue{context}; queue.queueFamilySelected = false; @@ -153,7 +157,7 @@ TEST(GetCommandQueueFamilyInfoTest, givenQueueFamilyNotSelectedWhenGettingFamily EXPECT_EQ(0u, queueIndex); } -TEST(GetCommandQueueFamilyInfoTest, givenQueueFamilySelectedWhenGettingFamilyAndQueueIndexThenValuesAreReturned) { +TEST_F(GetCommandQueueFamilyInfoTests, givenQueueFamilySelectedWhenGettingFamilyAndQueueIndexThenValuesAreReturned) { MockCommandQueue queue; queue.queueFamilySelected = true; queue.queueFamilyIndex = 12u; @@ -178,3 +182,87 @@ TEST(GetCommandQueueFamilyInfoTest, givenQueueFamilySelectedWhenGettingFamilyAnd EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(queue.queueIndexWithinFamily, queueIndex); } + +HWCMDTEST_F(IGFX_XE_HP_CORE, 
GetCommandQueueFamilyInfoTests, givenFamilyIdWhenGettingCommandQueueInfoThenCorrectValueIsReturned) { + HardwareInfo hwInfo = *defaultHwInfo.get(); + hwInfo.featureTable.ftrCCSNode = true; + MockClDevice mockClDevice{MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0)}; + + const cl_device_id deviceId = &mockClDevice; + auto context = clCreateContext(nullptr, 1, &deviceId, nullptr, nullptr, nullptr); + auto ccsFamily = mockClDevice.getDevice().getIndexOfNonEmptyEngineGroup(EngineGroupType::Compute); + cl_command_queue_properties properties[] = {CL_QUEUE_FAMILY_INTEL, ccsFamily, CL_QUEUE_INDEX_INTEL, 0, 0}; + EXPECT_EQ(1u, mockClDevice.getNumAvailableDevices()); + auto commandQueue = clCreateCommandQueueWithProperties(context, deviceId, properties, nullptr); + auto neoQueue = castToObject(commandQueue); + + cl_uint familyParameter; + auto retVal = neoQueue->getCommandQueueInfo( + CL_QUEUE_FAMILY_INTEL, + sizeof(familyParameter), + &familyParameter, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(ccsFamily, familyParameter); + + cl_uint indexParameter; + retVal = neoQueue->getCommandQueueInfo( + CL_QUEUE_INDEX_INTEL, + sizeof(indexParameter), + &indexParameter, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(0u, indexParameter); + + clReleaseCommandQueue(commandQueue); + clReleaseContext(context); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, GetCommandQueueFamilyInfoTests, givenFamilyIdWhenCreatingCommandQueueForRootDeviceWithMultipleSubDevicesThenInvalidValueIsReturned) { + DebugManagerStateRestore restorer; + DebugManager.flags.CreateMultipleSubDevices.set(2); + initPlatform(); + + auto rootDevice = platform()->getClDevice(0); + const cl_device_id deviceId = rootDevice; + auto context = clCreateContext(nullptr, 1, &deviceId, nullptr, nullptr, nullptr); + + cl_command_queue_properties properties[] = {CL_QUEUE_FAMILY_INTEL, static_cast(EngineGroupType::Compute), CL_QUEUE_INDEX_INTEL, 0, 0}; + EXPECT_EQ(2u, rootDevice->getNumAvailableDevices()); + cl_int 
retVal; + auto commandQueue = clCreateCommandQueueWithProperties(context, rootDevice, properties, &retVal); + + EXPECT_EQ(CL_INVALID_QUEUE_PROPERTIES, retVal); + EXPECT_EQ(nullptr, commandQueue); + + clReleaseContext(context); +} + +using MultiEngineQueueHwTests = ::testing::Test; +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiEngineQueueHwTests, givenLimitedNumberOfCcsWhenCreatingCmdQueueThenFailOnNotSupportedCcs) { + HardwareInfo localHwInfo = *defaultHwInfo; + localHwInfo.gtSystemInfo.CCSInfo.IsValid = true; + localHwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 4; + localHwInfo.gtSystemInfo.CCSInfo.Instances.CCSEnableMask = 0b1111; + localHwInfo.featureTable.ftrCCSNode = true; + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&localHwInfo)); + MockContext context(device.get()); + context.contextType = ContextType::CONTEXT_TYPE_UNRESTRICTIVE; + + const uint32_t ccsCount = 4; + + auto ccsEngine = device->getDevice().getIndexOfNonEmptyEngineGroup(EngineGroupType::Compute); + cl_queue_properties properties[5] = {CL_QUEUE_FAMILY_INTEL, ccsEngine, CL_QUEUE_INDEX_INTEL, 0, 0}; + + auto mutableHwInfo = device->getRootDeviceEnvironment().getMutableHardwareInfo(); + + for (uint32_t i = 0; i < ccsCount; i++) { + properties[3] = i; + mutableHwInfo->gtSystemInfo.CCSInfo.Instances.CCSEnableMask = (1 << i); + + cl_int retVal = CL_SUCCESS; + cl_command_queue clCommandQueue = clCreateCommandQueueWithProperties(&context, device.get(), properties, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + clReleaseCommandQueue(clCommandQueue); + } +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_stream/CMakeLists.txt b/opencl/test/unit_test/command_stream/CMakeLists.txt index dcd284ee4d..93a28151db 100644 --- a/opencl/test/unit_test/command_stream/CMakeLists.txt +++ b/opencl/test/unit_test/command_stream/CMakeLists.txt @@ -29,6 +29,7 @@ set(IGDRCL_SRCS_tests_command_stream ${CMAKE_CURRENT_SOURCE_DIR}/create_command_stream_receiver_tests.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/get_devices_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/experimental_command_buffer_tests.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling_ocl_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/linear_stream_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/linear_stream_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/submissions_aggregator_tests.cpp @@ -41,5 +42,16 @@ set(IGDRCL_SRCS_tests_command_stream ${CMAKE_CURRENT_SOURCE_DIR}/compute_mode_tests.h ) +if(TESTS_XEHP_PLUS) + list(APPEND IGDRCL_SRCS_tests_command_stream + ${CMAKE_CURRENT_SOURCE_DIR}/aub_command_stream_receiver_tests_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_flush_task_tests_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_hw_tests_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_simulated_common_hw_tests_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/compute_mode_tests_xehp_plus.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tbx_command_stream_receiver_tests_xehp_plus.cpp + ) +endif() + target_sources(igdrcl_tests PRIVATE ${IGDRCL_SRCS_tests_command_stream}) add_subdirectories() diff --git a/opencl/test/unit_test/command_stream/aub_command_stream_receiver_tests_xehp_plus.cpp b/opencl/test/unit_test/command_stream/aub_command_stream_receiver_tests_xehp_plus.cpp new file mode 100644 index 0000000000..d29e31aa13 --- /dev/null +++ b/opencl/test/unit_test/command_stream/aub_command_stream_receiver_tests_xehp_plus.cpp @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/aub_mem_dump/page_table_entry_bits.h" +#include "shared/source/helpers/engine_node_helper.h" +#include "shared/source/memory_manager/memory_banks.h" +#include "shared/source/memory_manager/memory_pool.h" +#include "shared/source/os_interface/device_factory.h" +#include "shared/source/os_interface/os_context.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include 
"shared/test/common/helpers/variable_backup.h" +#include "shared/test/common/mocks/mock_execution_environment.h" +#include "shared/test/common/mocks/mock_graphics_allocation.h" + +#include "opencl/source/command_stream/aub_command_stream_receiver_hw.h" +#include "opencl/source/helpers/memory_properties_helpers.h" +#include "opencl/source/mem_obj/buffer.h" +#include "opencl/test/unit_test/fixtures/cl_device_fixture.h" +#include "opencl/test/unit_test/helpers/hw_helper_tests.h" +#include "opencl/test/unit_test/libult/ult_aub_command_stream_receiver.h" +#include "opencl/test/unit_test/mocks/mock_aub_csr.h" +#include "opencl/test/unit_test/mocks/mock_context.h" +#include "opencl/test/unit_test/mocks/mock_os_context.h" +#include "opencl/test/unit_test/mocks/mock_platform.h" +#include "test.h" + +#include +#include + +using namespace NEO; + +struct XeHPPlusAubCommandStreamReceiverTests : ClDeviceFixture, ::testing::Test { + template + void setUpImpl() { + hardwareInfo = *defaultHwInfo; + hardwareInfoSetup[hardwareInfo.platform.eProductFamily](&hardwareInfo, true, 0); + hardwareInfo.gtSystemInfo.MultiTileArchInfo.IsValid = true; + ClDeviceFixture::SetUpImpl(&hardwareInfo); + } + + void SetUp() override { + } + + void TearDown() override { + ClDeviceFixture::TearDown(); + } +}; + +template +class MockAubCsrXeHPPlus : public AUBCommandStreamReceiverHw { + public: + using AUBCommandStreamReceiverHw::getAddressSpace; + using CommandStreamReceiverHw::localMemoryEnabled; + using CommandStreamReceiverSimulatedHw::createPhysicalAddressAllocator; + + MockAubCsrXeHPPlus(const std::string &fileName, + bool standalone, ExecutionEnvironment &executionEnvironment, + uint32_t rootDeviceIndex, + const DeviceBitfield deviceBitfield) + : AUBCommandStreamReceiverHw(fileName, standalone, executionEnvironment, rootDeviceIndex, deviceBitfield) {} + + uint32_t getDeviceIndex() const override { + return deviceIndex; + } + + uint32_t deviceIndex = 0u; +}; + +HWCMDTEST_F(IGFX_XE_HP_CORE, 
XeHPPlusAubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenGetGUCWorkQueueItemHeaderIsCalledThenAppropriateValueDependingOnEngineTypeIsReturned) { + setUpImpl(); + + MockOsContext rcsOsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_RCS, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs0OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs1OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS1, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs2OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS2, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs3OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS3, EngineUsage::Regular}, PreemptionMode::Disabled, false); + std::unique_ptr> aubCsr(new AUBCommandStreamReceiverHw("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + + aubCsr->setupContext(ccs0OsContext); + EXPECT_EQ(0x00030401u, aubCsr->getGUCWorkQueueItemHeader()); + aubCsr->setupContext(ccs1OsContext); + EXPECT_EQ(0x00030401u, aubCsr->getGUCWorkQueueItemHeader()); + aubCsr->setupContext(ccs2OsContext); + EXPECT_EQ(0x00030401u, aubCsr->getGUCWorkQueueItemHeader()); + aubCsr->setupContext(ccs3OsContext); + EXPECT_EQ(0x00030401u, aubCsr->getGUCWorkQueueItemHeader()); + aubCsr->setupContext(rcsOsContext); + EXPECT_EQ(0x00030001u, aubCsr->getGUCWorkQueueItemHeader()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenGraphicsAlloctionWithNonLocalMemoryPoolWhenGetPPGTTAdditionalBitsIsCalledThenAppropriateValueIsReturned) { + setUpImpl(); + + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(false); + + std::unique_ptr> aubCsr(new AUBCommandStreamReceiverHw("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + 
MockGraphicsAllocation allocation(nullptr, 0); + auto bits = aubCsr->getPPGTTAdditionalBits(&allocation); + + EXPECT_EQ(3u, bits); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenGraphicsAlloctionWithLocalMemoryPoolWhenGetPPGTTAdditionalBitsIsCalledThenAppropriateValueIsReturned) { + setUpImpl(); + + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(false); + + std::unique_ptr> aubCsr(new AUBCommandStreamReceiverHw("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + MockGraphicsAllocation allocation(nullptr, 0); + allocation.overrideMemoryPool(MemoryPool::LocalMemory); + auto bits = aubCsr->getPPGTTAdditionalBits(&allocation); + + EXPECT_EQ(3u | (1 << 11), bits); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenAubDumpForceAllToLocalMemoryPoolWhenGetPPGTTAdditionalBitsIsCalledThenLocalBitIsReturned) { + setUpImpl(); + + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(true); + + std::unique_ptr> aubCsr(new AUBCommandStreamReceiverHw("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + MockGraphicsAllocation allocation(nullptr, 0); + + auto bits = aubCsr->getPPGTTAdditionalBits(&allocation); + + EXPECT_EQ(3u | (1 << 11), bits); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenAubDumpForceAllToLocalMemoryEnabledWhenGetAddressSpaceIsCalledThenTraceLocalIsReturned) { + setUpImpl(); + + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(true); + + std::unique_ptr> aubCsr(new MockAubCsrXeHPPlus("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + auto stream = std::make_unique(); + aubCsr->stream = stream.get(); + + auto addressSpace = 
aubCsr->getAddressSpace(AubMemDump::DataTypeHintValues::TraceNotype); + + EXPECT_EQ(AubMemDump::AddressSpaceValues::TraceLocal, addressSpace); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenAubDumpForceAllToLocalMemoryDisabledWhenGetAddressSpaceIsCalledThenTraceNonlocalIsReturned) { + setUpImpl(); + + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(false); + + std::unique_ptr> aubCsr(new MockAubCsrXeHPPlus("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + auto stream = std::make_unique(); + aubCsr->stream = stream.get(); + + auto addressSpace = aubCsr->getAddressSpace(AubMemDump::DataTypeHintValues::TraceNotype); + + EXPECT_EQ(AubMemDump::AddressSpaceValues::TraceNonlocal, addressSpace); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenCCSEnabledWhenEngineMmiosAreInitializedThenExpectL3ConfigMmioIsWritten) { + setUpImpl(); + + MockOsContext osContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS, EngineUsage::Regular}, PreemptionMode::Disabled, false); + AUBCommandStreamReceiverHw aubCsr("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + aubCsr.setupContext(osContext); + + auto stream = std::make_unique(); + aubCsr.stream = stream.get(); + + aubCsr.initEngineMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0xB234, 0xA0000000u))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenRCSEnabledWhenEngineMmiosAreInitializedThenExpectL3ConfigMmioIsWritten) { + setUpImpl(); + + MockOsContext osContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_RCS, EngineUsage::Regular}, PreemptionMode::Disabled, false); + AUBCommandStreamReceiverHw aubCsr("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + aubCsr.setupContext(osContext); + + auto stream = std::make_unique(); + 
aubCsr.stream = stream.get(); + + aubCsr.initEngineMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0xB134, 0xA0000000u))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenLocaLMemoryBitWhenGetAddressSpaceFromPTEBitsIsCalledThenTraceLocalIsReturned) { + setUpImpl(); + + std::unique_ptr> aubCsr(new MockAubCsrXeHPPlus("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + auto stream = std::make_unique(); + aubCsr->stream = stream.get(); + + uint64_t bits = BIT(PageTableEntry::presentBit) | BIT(PageTableEntry::writableBit) | BIT(PageTableEntry::localMemoryBit); + auto addressSpace = aubCsr->getAddressSpaceFromPTEBits(bits); + EXPECT_EQ(AubMemDump::AddressSpaceValues::TraceLocal, addressSpace); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenLocalMemoryEnabledWhenGetMemoryBankForGttIsCalledThenCorrectBankForDeviceIsReturned) { + setUpImpl(); + + std::unique_ptr> aubCsr(new MockAubCsrXeHPPlus("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + aubCsr->localMemoryEnabled = true; + + auto bank = aubCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(0), bank); + + aubCsr->deviceIndex = 1u; + bank = aubCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(1), bank); + + aubCsr->deviceIndex = 2u; + bank = aubCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(2), bank); + + aubCsr->deviceIndex = 3u; + bank = aubCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(3), bank); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, whenPhysicalAllocatorIsCreatedThenItHasCorrectBankSzieAndNumberOfBanks) { + setUpImpl(); + + std::unique_ptr> aubCsr(new MockAubCsrXeHPPlus("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + + auto 
physicalAddressAllocator = std::unique_ptr(aubCsr->createPhysicalAddressAllocator(&pDevice->getHardwareInfo())); + auto allocator = reinterpret_cast *>(physicalAddressAllocator.get()); + + EXPECT_EQ(32 * MemoryConstants::gigaByte, allocator->getBankSize()); + EXPECT_EQ(1u, allocator->getNumberOfBanks()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, whenPhysicalAllocatorIsCreatedWith4TileConfigThenItHasCorrectBankSzieAndNumberOfBanks) { + DebugManagerStateRestore restorer; + DebugManager.flags.CreateMultipleSubDevices.set(4); + setUpImpl(); + + std::unique_ptr> aubCsr(new MockAubCsrXeHPPlus("", true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + + auto physicalAddressAllocator = std::unique_ptr(aubCsr->createPhysicalAddressAllocator(&pDevice->getHardwareInfo())); + auto allocator = reinterpret_cast *>(physicalAddressAllocator.get()); + + EXPECT_EQ(8 * MemoryConstants::gigaByte, allocator->getBankSize()); + EXPECT_EQ(4u, allocator->getNumberOfBanks()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenInitEngineMMIOIsCalledForGivenEngineTypeThenCorrespondingMmiosAreInitialized) { + setUpImpl(); + + DebugManagerStateRestore debugRestorer; + MockOsContext rcsOsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_RCS, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs0OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs1OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS1, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs2OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS2, EngineUsage::Regular}, PreemptionMode::Disabled, false); + MockOsContext ccs3OsContext(0, 1, EngineTypeUsage{aub_stream::ENGINE_CCS3, EngineUsage::Regular}, PreemptionMode::Disabled, false); + + auto aubCsr = std::make_unique>("", 
true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + EXPECT_NE(nullptr, aubCsr); + + auto stream = std::make_unique(); + aubCsr->stream = stream.get(); + + aubCsr->setupContext(rcsOsContext); + aubCsr->initEngineMMIO(); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0002000 + 0x000058, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0002000 + 0x00029c, 0xffff8280))); + + aubCsr->setupContext(ccs0OsContext); + aubCsr->initEngineMMIO(); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000ce90, 0x00030003))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x1a000 + 0x000029c, 0xffff8280))); + + aubCsr->setupContext(ccs1OsContext); + aubCsr->initEngineMMIO(); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000ce90, 0x00030003))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x1c000 + 0x000029c, 0xffff8280))); + + aubCsr->setupContext(ccs2OsContext); + aubCsr->initEngineMMIO(); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000ce90, 0x00030003))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x1e000 + 0x000029c, 0xffff8280))); + + aubCsr->setupContext(ccs3OsContext); + aubCsr->initEngineMMIO(); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000ce90, 0x00030003))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x26000 + 0x000029c, 0xffff8280))); +} + +template +static void checkCcsEngineMMIO(aub_stream::EngineType engineType, uint32_t mmioBase) { + auto &mmioList = *AUBFamilyMapper::perEngineMMIO[engineType]; + + EXPECT_EQ(mmioList[0], MMIOPair(0x0000ce90, 0x00030003)); + EXPECT_EQ(mmioList[1], MMIOPair(0x0000b170, 0x00030003)); + EXPECT_EQ(mmioList[2], MMIOPair(0x00014800, 0xFFFF0001)); + EXPECT_EQ(mmioList[3], MMIOPair(mmioBase + 0x000029c, 0xffff8280)); + + EXPECT_EQ(mmioList[4], MMIOPair(mmioBase + 0x00004d0, 0x0000e000)); + EXPECT_EQ(mmioList[5], MMIOPair(mmioBase + 0x00004d4, 0x0000e000)); + EXPECT_EQ(mmioList[6], MMIOPair(mmioBase + 0x00004d8, 0x0000e000)); + EXPECT_EQ(mmioList[7], MMIOPair(mmioBase + 0x00004dc, 
0x0000e000)); + EXPECT_EQ(mmioList[8], MMIOPair(mmioBase + 0x00004e0, 0x0000e000)); + EXPECT_EQ(mmioList[9], MMIOPair(mmioBase + 0x00004e4, 0x0000e000)); + EXPECT_EQ(mmioList[10], MMIOPair(mmioBase + 0x00004e8, 0x0000e000)); + EXPECT_EQ(mmioList[11], MMIOPair(mmioBase + 0x00004ec, 0x0000e000)); + EXPECT_EQ(mmioList[12], MMIOPair(mmioBase + 0x00004f0, 0x0000e000)); + EXPECT_EQ(mmioList[13], MMIOPair(mmioBase + 0x00004f4, 0x0000e000)); + EXPECT_EQ(mmioList[14], MMIOPair(mmioBase + 0x00004f8, 0x0000e000)); + EXPECT_EQ(mmioList[15], MMIOPair(mmioBase + 0x00004fc, 0x0000e000)); + + EXPECT_EQ(mmioList[16], MMIOPair(0x0000B234, 0xA0000000)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenCcsEngineMmioListForSpecificCcsInstanceIsReadThenItIsInitializedWithProperValues) { + setUpImpl(); + + checkCcsEngineMMIO(aub_stream::ENGINE_CCS, 0x1a000); + checkCcsEngineMMIO(aub_stream::ENGINE_CCS1, 0x1c000); + checkCcsEngineMMIO(aub_stream::ENGINE_CCS2, 0x1e000); + checkCcsEngineMMIO(aub_stream::ENGINE_CCS3, 0x26000); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenRcsEngineMmioListIsReadThenItIsInitializedWithProperValues) { + setUpImpl(); + + auto &mmioList = *AUBFamilyMapper::perEngineMMIO[aub_stream::ENGINE_RCS]; + auto mmioBase = 0x002000; + + EXPECT_EQ(mmioList[0], MMIOPair(mmioBase + 0x000058, 0x00000000)); + EXPECT_EQ(mmioList[1], MMIOPair(mmioBase + 0x0000a8, 0x00000000)); + EXPECT_EQ(mmioList[2], MMIOPair(mmioBase + 0x000029c, 0xffff8280)); + + EXPECT_EQ(mmioList[3], MMIOPair(0x00002090, 0xffff0000)); + EXPECT_EQ(mmioList[4], MMIOPair(0x000020e0, 0xffff4000)); + EXPECT_EQ(mmioList[5], MMIOPair(0x000020e4, 0xffff0000)); + EXPECT_EQ(mmioList[6], MMIOPair(0x000020ec, 0xffff0051)); + + EXPECT_EQ(mmioList[7], MMIOPair(mmioBase + 0x00004d0, 0x00007014)); + EXPECT_EQ(mmioList[8], MMIOPair(mmioBase + 0x00004d4, 0x0000e000)); + EXPECT_EQ(mmioList[9], 
MMIOPair(mmioBase + 0x00004d8, 0x0000e000)); + EXPECT_EQ(mmioList[10], MMIOPair(mmioBase + 0x00004dc, 0x0000e000)); + EXPECT_EQ(mmioList[11], MMIOPair(mmioBase + 0x00004e0, 0x0000e000)); + EXPECT_EQ(mmioList[12], MMIOPair(mmioBase + 0x00004e4, 0x0000e000)); + EXPECT_EQ(mmioList[13], MMIOPair(mmioBase + 0x00004e8, 0x0000e000)); + EXPECT_EQ(mmioList[14], MMIOPair(mmioBase + 0x00004ec, 0x0000e000)); + EXPECT_EQ(mmioList[15], MMIOPair(mmioBase + 0x00004f0, 0x0000e000)); + EXPECT_EQ(mmioList[16], MMIOPair(mmioBase + 0x00004f4, 0x0000e000)); + EXPECT_EQ(mmioList[17], MMIOPair(mmioBase + 0x00004f8, 0x0000e000)); + EXPECT_EQ(mmioList[18], MMIOPair(mmioBase + 0x00004fc, 0x0000e000)); + + EXPECT_EQ(mmioList[19], MMIOPair(0x00002580, 0xffff0005)); + EXPECT_EQ(mmioList[20], MMIOPair(0x0000e194, 0xffff0002)); + + EXPECT_EQ(mmioList[21], MMIOPair(0x0000B134, 0xA0000000)); +} + +using XeHPPlusAubCommandStreamReceiverTests2 = HwHelperTest; + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusAubCommandStreamReceiverTests2, givenLocalMemoryEnabledInCSRWhenGetGTTDataIsCalledThenLocalMemoryIsSet) { + DebugManagerStateRestore debugRestorer; + DebugManager.flags.EnableLocalMemory.set(1); + hardwareInfo.featureTable.ftrLocalMemory = true; + + std::unique_ptr device(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo)); + std::unique_ptr> aubCsr(std::make_unique>("", true, *device->executionEnvironment, device->getRootDeviceIndex(), device->getDeviceBitfield())); + EXPECT_TRUE(aubCsr->localMemoryEnabled); + + AubGTTData data = {false, false}; + aubCsr->getGTTData(nullptr, data); + EXPECT_TRUE(data.localMemory); +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_plus.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_plus.cpp new file mode 100644 index 0000000000..7dff95e1ea --- /dev/null +++ 
b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_plus.cpp @@ -0,0 +1,685 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/gmm_helper/gmm_helper.h" +#include "shared/source/helpers/state_base_address.h" +#include "shared/test/common/cmd_parse/hw_parse.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" + +#include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h" +#include "opencl/test/unit_test/libult/ult_command_stream_receiver.h" +#include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_csr.h" +#include "opencl/test/unit_test/mocks/mock_submissions_aggregator.h" +#include "test.h" + +using namespace NEO; + +typedef UltCommandStreamReceiverTest CommandStreamReceiverFlushTaskXeHPPlusTests; + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, whenReprogrammingSshThenBindingTablePoolIsProgrammed) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + flushTask(commandStreamReceiver); + parseCommands(commandStreamReceiver.getCS(0)); + auto bindingTablePoolAlloc = getCommand(); + ASSERT_NE(nullptr, bindingTablePoolAlloc); + EXPECT_EQ(reinterpret_cast(ssh.getCpuBase()), bindingTablePoolAlloc->getBindingTablePoolBaseAddress()); + EXPECT_EQ(ssh.getHeapSizeInPages(), bindingTablePoolAlloc->getBindingTablePoolBufferSize()); + EXPECT_EQ(pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_STATE_HEAP_BUFFER), + bindingTablePoolAlloc->getSurfaceObjectControlStateIndexToMocsTables()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, whenReprogrammingSshThenBindingTablePoolIsProgrammedWithCachingOffWhenDebugKeyPresent) { + DebugManagerStateRestore restorer; + DebugManager.flags.DisableCachingForHeaps.set(1); + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + 
flushTask(commandStreamReceiver); + parseCommands(commandStreamReceiver.getCS(0)); + auto bindingTablePoolAlloc = getCommand(); + ASSERT_NE(nullptr, bindingTablePoolAlloc); + EXPECT_EQ(reinterpret_cast(ssh.getCpuBase()), bindingTablePoolAlloc->getBindingTablePoolBaseAddress()); + EXPECT_EQ(ssh.getHeapSizeInPages(), bindingTablePoolAlloc->getBindingTablePoolBufferSize()); + EXPECT_EQ(pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_SYSTEM_MEMORY_BUFFER_CACHELINE_MISALIGNED), + bindingTablePoolAlloc->getSurfaceObjectControlStateIndexToMocsTables()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, whenNotReprogrammingSshThenBindingTablePoolIsNotProgrammed) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + flushTask(commandStreamReceiver); + parseCommands(commandStreamReceiver.getCS(0)); + auto stateBaseAddress = getCommand(); + EXPECT_NE(nullptr, stateBaseAddress); + auto bindingTablePoolAlloc = getCommand(); + ASSERT_NE(nullptr, bindingTablePoolAlloc); + EXPECT_EQ(reinterpret_cast(ssh.getCpuBase()), bindingTablePoolAlloc->getBindingTablePoolBaseAddress()); + EXPECT_EQ(ssh.getHeapSizeInPages(), bindingTablePoolAlloc->getBindingTablePoolBufferSize()); + EXPECT_EQ(pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_STATE_HEAP_BUFFER), + bindingTablePoolAlloc->getSurfaceObjectControlStateIndexToMocsTables()); + + auto offset = commandStreamReceiver.getCS(0).getUsed(); + // make SBA dirty (using ioh as dsh and dsh as ioh just to force SBA reprogramming) + commandStreamReceiver.flushTask(commandStream, 0, ioh, dsh, ssh, taskLevel, flushTaskFlags, *pDevice); + + HardwareParse hwParser; + hwParser.parseCommands(commandStreamReceiver.getCS(0), offset); + stateBaseAddress = hwParser.getCommand(); + EXPECT_NE(nullptr, stateBaseAddress); + bindingTablePoolAlloc = hwParser.getCommand(); + EXPECT_EQ(nullptr, bindingTablePoolAlloc); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, 
givenStateBaseAddressWhenItIsRequiredThenThereIsPipeControlPriorToItWithTextureCacheFlushAndHdc) { + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + configureCSRtoNonDirtyState(false); + ioh.replaceBuffer(ptrOffset(ioh.getCpuBase(), +1u), ioh.getMaxAvailableSpace() + MemoryConstants::pageSize * 3); + flushTask(commandStreamReceiver); + parseCommands(commandStreamReceiver.getCS(0)); + + auto stateBaseAddressItor = find(cmdList.begin(), cmdList.end()); + auto pipeControlItor = find(cmdList.begin(), stateBaseAddressItor); + EXPECT_NE(stateBaseAddressItor, pipeControlItor); + auto pipeControlCmd = reinterpret_cast(*pipeControlItor); + EXPECT_TRUE(pipeControlCmd->getTextureCacheInvalidationEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControlCmd->getDcFlushEnable()); + EXPECT_TRUE(pipeControlCmd->getHdcPipelineFlush()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, whenNotReprogrammingSshButInitProgrammingFlagsThenBindingTablePoolIsProgrammed) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + flushTask(commandStreamReceiver); + parseCommands(commandStreamReceiver.getCS(0)); + auto stateBaseAddress = getCommand(); + EXPECT_NE(nullptr, stateBaseAddress); + auto bindingTablePoolAlloc = getCommand(); + ASSERT_NE(nullptr, bindingTablePoolAlloc); + EXPECT_EQ(reinterpret_cast(ssh.getCpuBase()), bindingTablePoolAlloc->getBindingTablePoolBaseAddress()); + EXPECT_EQ(ssh.getHeapSizeInPages(), bindingTablePoolAlloc->getBindingTablePoolBufferSize()); + EXPECT_EQ(pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_STATE_HEAP_BUFFER), + bindingTablePoolAlloc->getSurfaceObjectControlStateIndexToMocsTables()); + + auto offset = commandStreamReceiver.getCS(0).getUsed(); + commandStreamReceiver.initProgrammingFlags(); + flushTask(commandStreamReceiver); + + HardwareParse hwParser; + 
hwParser.parseCommands(commandStreamReceiver.getCS(0), offset); + stateBaseAddress = hwParser.getCommand(); + EXPECT_NE(nullptr, stateBaseAddress); + bindingTablePoolAlloc = hwParser.getCommand(); + EXPECT_NE(nullptr, bindingTablePoolAlloc); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenSbaProgrammingWhenHeapsAreNotProvidedThenDontProgram) { + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + + uint64_t instructionHeapBase = 0x10000; + uint64_t internalHeapBase = 0x10000; + uint64_t generalStateBase = 0x30000; + STATE_BASE_ADDRESS sbaCmd; + StateBaseAddressHelper::programStateBaseAddress(&sbaCmd, + nullptr, + nullptr, + nullptr, + generalStateBase, + true, + 0, + internalHeapBase, + instructionHeapBase, + 0, + true, + false, + pDevice->getGmmHelper(), + false, + MemoryCompressionState::NotApplicable, + false, + 1u); + + EXPECT_FALSE(sbaCmd.getDynamicStateBaseAddressModifyEnable()); + EXPECT_FALSE(sbaCmd.getDynamicStateBufferSizeModifyEnable()); + EXPECT_EQ(0u, sbaCmd.getDynamicStateBaseAddress()); + EXPECT_EQ(0u, sbaCmd.getDynamicStateBufferSize()); + + EXPECT_FALSE(sbaCmd.getIndirectObjectBaseAddressModifyEnable()); + EXPECT_FALSE(sbaCmd.getIndirectObjectBufferSizeModifyEnable()); + EXPECT_EQ(0u, sbaCmd.getIndirectObjectBaseAddress()); + EXPECT_EQ(0u, sbaCmd.getIndirectObjectBufferSize()); + + EXPECT_FALSE(sbaCmd.getSurfaceStateBaseAddressModifyEnable()); + EXPECT_EQ(0u, sbaCmd.getSurfaceStateBaseAddress()); + + EXPECT_TRUE(sbaCmd.getInstructionBaseAddressModifyEnable()); + EXPECT_EQ(instructionHeapBase, sbaCmd.getInstructionBaseAddress()); + EXPECT_TRUE(sbaCmd.getInstructionBufferSizeModifyEnable()); + EXPECT_EQ(MemoryConstants::sizeOf4GBinPageEntities, sbaCmd.getInstructionBufferSize()); + + EXPECT_TRUE(sbaCmd.getGeneralStateBaseAddressModifyEnable()); + EXPECT_TRUE(sbaCmd.getGeneralStateBufferSizeModifyEnable()); + if 
constexpr (is64bit) { + EXPECT_EQ(GmmHelper::decanonize(internalHeapBase), sbaCmd.getGeneralStateBaseAddress()); + } else { + EXPECT_EQ(generalStateBase, sbaCmd.getGeneralStateBaseAddress()); + } + EXPECT_EQ(0xfffffu, sbaCmd.getGeneralStateBufferSize()); + + EXPECT_EQ(0u, sbaCmd.getBindlessSurfaceStateBaseAddress()); + EXPECT_FALSE(sbaCmd.getBindlessSurfaceStateBaseAddressModifyEnable()); + EXPECT_EQ(0u, sbaCmd.getBindlessSurfaceStateSize()); +} + +using isXeHPOrAbove = IsAtLeastProduct; +HWTEST2_F(CommandStreamReceiverFlushTaskXeHPPlusTests, whenFlushAllCachesVariableIsSetAndAddPipeControlIsCalledThenFieldsAreProperlySet, isXeHPOrAbove) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + DebugManagerStateRestore dbgRestorer; + DebugManager.flags.FlushAllCaches.set(true); + + char buff[sizeof(PIPE_CONTROL) * 3]; + LinearStream stream(buff, sizeof(PIPE_CONTROL) * 3); + + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControl(stream, args); + + parseCommands(stream, 0); + + PIPE_CONTROL *pipeControl = getCommand(); + + ASSERT_NE(nullptr, pipeControl); + + // WA pipeControl added + if (cmdList.size() == 2) { + pipeControl++; + } + + EXPECT_TRUE(pipeControl->getDcFlushEnable()); + EXPECT_TRUE(pipeControl->getRenderTargetCacheFlushEnable()); + EXPECT_TRUE(pipeControl->getInstructionCacheInvalidateEnable()); + EXPECT_TRUE(pipeControl->getTextureCacheInvalidationEnable()); + EXPECT_TRUE(pipeControl->getPipeControlFlushEnable()); + EXPECT_TRUE(pipeControl->getVfCacheInvalidationEnable()); + EXPECT_TRUE(pipeControl->getConstantCacheInvalidationEnable()); + EXPECT_TRUE(pipeControl->getStateCacheInvalidationEnable()); + // XeHP+ only field + EXPECT_TRUE(pipeControl->getCompressionControlSurfaceCcsFlush()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenconfigureCSRtoNonDirtyStateWhenFlushTaskIsCalledThenNoCommandsAreAdded) { + configureCSRtoNonDirtyState(true); + auto &commandStreamReceiver = 
pDevice->getUltCommandStreamReceiver(); + flushTask(commandStreamReceiver); + EXPECT_EQ(0u, commandStreamReceiver.commandStream.getUsed()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenMultiOsContextCommandStreamReceiverWhenFlushTaskIsCalledThenCommandStreamReceiverStreamIsUsed) { + configureCSRtoNonDirtyState(true); + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.multiOsContextCapable = true; + commandStream.getSpace(4); + + flushTask(commandStreamReceiver); + EXPECT_EQ(MemoryConstants::cacheLineSize, commandStreamReceiver.commandStream.getUsed()); + auto batchBufferStart = genCmdCast(commandStreamReceiver.commandStream.getCpuBase()); + EXPECT_NE(nullptr, batchBufferStart); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenCsrInBatchingModeWhenTaskIsSubmittedViaCsrThenBbEndCoversPaddingEnoughToFitMiBatchBufferStart) { + auto &mockCsr = pDevice->getUltCommandStreamReceiver(); + mockCsr.overrideDispatchPolicy(DispatchMode::BatchedDispatch); + mockCsr.timestampPacketWriteEnabled = false; + + configureCSRtoNonDirtyState(true); + + mockCsr.getCS(1024u); + auto &csrCommandStream = mockCsr.commandStream; + + //we do level change that will emit PPC, fill all the space so only BB end fits. + taskLevel++; + auto ppcSize = MemorySynchronizationCommands::getSizeForSinglePipeControl(); + auto fillSize = MemoryConstants::cacheLineSize - ppcSize - sizeof(typename FamilyType::MI_BATCH_BUFFER_END); + csrCommandStream.getSpace(fillSize); + auto expectedUsedSize = 2 * MemoryConstants::cacheLineSize; + + flushTask(mockCsr); + + EXPECT_EQ(expectedUsedSize, mockCsr.commandStream.getUsed()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, GivenSameTaskLevelThenDontSendPipeControl) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + // Configure the CSR to not need to submit any state or commands. 
+ configureCSRtoNonDirtyState(true); + + flushTask(commandStreamReceiver); + + EXPECT_EQ(taskLevel, commandStreamReceiver.taskLevel); + + auto sizeUsed = commandStreamReceiver.commandStream.getUsed(); + EXPECT_EQ(sizeUsed, 0u); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenDeviceWithThreadGroupPreemptionSupportThenDontSendMediaVfeStateIfNotDirty) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.ForcePreemptionMode.set(static_cast(PreemptionMode::ThreadGroup)); + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->setPreemptionMode(PreemptionMode::ThreadGroup); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + + // Configure the CSR to not need to submit any state or commands. + configureCSRtoNonDirtyState(true); + + flushTask(*commandStreamReceiver); + + EXPECT_EQ(taskLevel, commandStreamReceiver->peekTaskLevel()); + + auto sizeUsed = commandStreamReceiver->commandStream.getUsed(); + EXPECT_EQ(0u, sizeUsed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenCommandStreamReceiverWithInstructionCacheRequestWhenFlushTaskIsCalledThenPipeControlWithInstructionCacheIsEmitted) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + configureCSRtoNonDirtyState(true); + + commandStreamReceiver.registerInstructionCacheFlush(); + EXPECT_EQ(1u, commandStreamReceiver.recursiveLockCounter); + + flushTask(commandStreamReceiver); + + parseCommands(commandStreamReceiver.commandStream, 0); + + auto itorPC = find(cmdList.begin(), cmdList.end()); + EXPECT_NE(cmdList.end(), itorPC); + auto pipeControlCmd = reinterpret_cast(*itorPC); + EXPECT_TRUE(pipeControlCmd->getInstructionCacheInvalidateEnable()); + EXPECT_FALSE(commandStreamReceiver.requiresInstructionCacheFlush); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, 
givenHigherTaskLevelWhenTimestampPacketWriteIsEnabledThenDontAddPipeControl) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.timestampPacketWriteEnabled = true; + commandStreamReceiver.isPreambleSent = true; + configureCSRtoNonDirtyState(true); + commandStreamReceiver.taskLevel = taskLevel; + taskLevel++; // submit with higher taskLevel + + flushTask(commandStreamReceiver); + + parseCommands(commandStreamReceiver.commandStream, 0); + + auto itorPC = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(cmdList.end(), itorPC); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, WhenForcePipeControlPriorToWalkerIsSetThenAddExtraPipeControls) { + DebugManagerStateRestore stateResore; + DebugManager.flags.ForcePipeControlPriorToWalker.set(true); + DebugManager.flags.FlushAllCaches.set(true); + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.isPreambleSent = true; + configureCSRtoNonDirtyState(true); + commandStreamReceiver.taskLevel = taskLevel; + + flushTask(commandStreamReceiver); + + parseCommands(commandStreamReceiver.commandStream, 0); + + GenCmdList::iterator itor = cmdList.begin(); + int counterPC = 0; + while (itor != cmdList.end()) { + auto pipeControl = genCmdCast(*itor); + if (pipeControl) { + switch (counterPC) { + case 0: // First pipe control with CS Stall + EXPECT_EQ(bool(pipeControl->getCommandStreamerStallEnable()), true); + EXPECT_EQ(bool(pipeControl->getDcFlushEnable()), false); + EXPECT_EQ(bool(pipeControl->getRenderTargetCacheFlushEnable()), false); + EXPECT_EQ(bool(pipeControl->getInstructionCacheInvalidateEnable()), false); + EXPECT_EQ(bool(pipeControl->getTextureCacheInvalidationEnable()), false); + EXPECT_EQ(bool(pipeControl->getPipeControlFlushEnable()), false); + EXPECT_EQ(bool(pipeControl->getVfCacheInvalidationEnable()), false); + EXPECT_EQ(bool(pipeControl->getConstantCacheInvalidationEnable()), false); + 
EXPECT_EQ(bool(pipeControl->getStateCacheInvalidationEnable()), false); + break; + case 1: // Second pipe control with all flushes + EXPECT_EQ(bool(pipeControl->getCommandStreamerStallEnable()), true); + EXPECT_EQ(bool(pipeControl->getDcFlushEnable()), true); + EXPECT_EQ(bool(pipeControl->getRenderTargetCacheFlushEnable()), true); + EXPECT_EQ(bool(pipeControl->getInstructionCacheInvalidateEnable()), true); + EXPECT_EQ(bool(pipeControl->getTextureCacheInvalidationEnable()), true); + EXPECT_EQ(bool(pipeControl->getPipeControlFlushEnable()), true); + EXPECT_EQ(bool(pipeControl->getVfCacheInvalidationEnable()), true); + EXPECT_EQ(bool(pipeControl->getConstantCacheInvalidationEnable()), true); + EXPECT_EQ(bool(pipeControl->getStateCacheInvalidationEnable()), true); + default: + break; + } + counterPC++; + } + + ++itor; + } + + EXPECT_EQ(counterPC, 2); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, whenSamplerCacheFlushNotRequiredThenDontSendPipecontrol) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + NEO::WorkaroundTable *waTable = &pDevice->getRootDeviceEnvironment().getMutableHardwareInfo()->workaroundTable; + + commandStreamReceiver.isPreambleSent = true; + commandStreamReceiver.lastPreemptionMode = pDevice->getPreemptionMode(); + commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushNotRequired); + configureCSRtoNonDirtyState(true); + commandStreamReceiver.taskLevel = taskLevel; + waTable->waSamplerCacheFlushBetweenRedescribedSurfaceReads = true; + flushTask(commandStreamReceiver); + + EXPECT_EQ(commandStreamReceiver.commandStream.getUsed(), 0u); + EXPECT_EQ(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushNotRequired, commandStreamReceiver.samplerCacheFlushRequired); + + parseCommands(commandStreamReceiver.commandStream, 0); + + auto itorPC = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(cmdList.end(), itorPC); +} + 
+HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, whenSamplerCacheFlushBeforeAndWaSamplerCacheFlushBetweenRedescribedSurfaceReadsDasabledThenDontSendPipecontrol) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.isPreambleSent = true; + commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore); + configureCSRtoNonDirtyState(true); + commandStreamReceiver.taskLevel = taskLevel; + NEO::WorkaroundTable *waTable = &pDevice->getRootDeviceEnvironment().getMutableHardwareInfo()->workaroundTable; + + waTable->waSamplerCacheFlushBetweenRedescribedSurfaceReads = false; + + flushTask(commandStreamReceiver); + + EXPECT_EQ(commandStreamReceiver.commandStream.getUsed(), 0u); + EXPECT_EQ(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore, commandStreamReceiver.samplerCacheFlushRequired); + + parseCommands(commandStreamReceiver.commandStream, 0); + + auto itorPC = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(cmdList.end(), itorPC); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, WhenFlushingTaskThenStateBaseAddressProgrammingShouldMatchTracking) { + typedef typename FamilyType::STATE_BASE_ADDRESS STATE_BASE_ADDRESS; + auto gmmHelper = pDevice->getGmmHelper(); + auto stateHeapMocs = gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_STATE_HEAP_BUFFER); + auto l1CacheOnMocs = gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CONST); + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + flushTask(commandStreamReceiver); + + auto &commandStreamCSR = commandStreamReceiver.commandStream; + HardwareParse::parseCommands(commandStreamCSR, 0); + HardwareParse::findHardwareCommands(); + + ASSERT_NE(nullptr, cmdStateBaseAddress); + auto &cmd = *reinterpret_cast(cmdStateBaseAddress); + + EXPECT_EQ(dsh.getCpuBase(), reinterpret_cast(cmd.getDynamicStateBaseAddress())); + 
EXPECT_EQ(commandStreamReceiver.getMemoryManager()->getInternalHeapBaseAddress(commandStreamReceiver.rootDeviceIndex, ioh.getGraphicsAllocation()->isAllocatedInLocalMemoryPool()), cmd.getInstructionBaseAddress()); + EXPECT_EQ(ioh.getCpuBase(), reinterpret_cast(cmd.getIndirectObjectBaseAddress())); + EXPECT_EQ(ssh.getCpuBase(), reinterpret_cast(cmd.getSurfaceStateBaseAddress())); + + EXPECT_EQ(l1CacheOnMocs, cmd.getStatelessDataPortAccessMemoryObjectControlState()); + EXPECT_EQ(stateHeapMocs, cmd.getInstructionMemoryObjectControlState()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, GivenBlockingWhenFlushingTaskThenPipeControlProgrammedCorrectly) { + typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; + CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + + // Configure the CSR to not need to submit any state or commands + configureCSRtoNonDirtyState(true); + + // Force a PIPE_CONTROL through a blocking flag + auto blocking = true; + auto &commandStreamTask = commandQueue.getCS(1024); + auto &commandStreamCSR = commandStreamReceiver->getCS(); + commandStreamReceiver->lastSentCoherencyRequest = 0; + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + dispatchFlags.blocking = blocking; + dispatchFlags.guardCommandBufferWithPipeControl = true; + + commandStreamReceiver->flushTask( + commandStreamTask, + 0, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + + // Verify that taskCS got modified, while csrCS remained intact + EXPECT_GT(commandStreamTask.getUsed(), 0u); + EXPECT_EQ(0u, commandStreamCSR.getUsed()); + + // Parse command list to verify that PC got added to taskCS + 
cmdList.clear(); + parseCommands(commandStreamTask, 0); + auto itorTaskCS = find(cmdList.begin(), cmdList.end()); + EXPECT_NE(cmdList.end(), itorTaskCS); + + // Parse command list to verify that PC wasn't added to csrCS + cmdList.clear(); + parseCommands(commandStreamCSR, 0); + auto numberOfPC = getCommandsList().size(); + EXPECT_EQ(0u, numberOfPC); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenCsrInNonDirtyStateWhenflushTaskIsCalledThenNoFlushIsCalled) { + CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); + auto &commandStream = commandQueue.getCS(4096u); + + auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(mockCsr); + + configureCSRtoNonDirtyState(true); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + + mockCsr->flushTask(commandStream, + 0, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + + EXPECT_EQ(0, mockCsr->flushCalledCount); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenCsrInNonDirtyStateAndBatchingModeWhenflushTaskIsCalledWithDisabledPreemptionThenSubmissionIsNotRecorded) { + CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); + auto &commandStream = commandQueue.getCS(4096u); + + auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(mockCsr); + + mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch); + + auto mockedSubmissionsAggregator = new mockSubmissionsAggregator(); + mockCsr->overrideSubmissionAggregator(mockedSubmissionsAggregator); + + configureCSRtoNonDirtyState(true); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + 
dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + + mockCsr->flushTask(commandStream, + 0, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + + EXPECT_EQ(0, mockCsr->flushCalledCount); + + EXPECT_TRUE(mockedSubmissionsAggregator->peekCmdBufferList().peekIsEmpty()); + + //surfaces are non resident + auto &surfacesForResidency = mockCsr->getResidencyAllocations(); + EXPECT_EQ(0u, surfacesForResidency.size()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenCsrInBatchingModeWhenRecordedBatchBufferIsBeingSubmittedThenFlushIsCalledWithRecordedCommandBuffer) { + CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); + auto &commandStream = commandQueue.getCS(4096u); + + auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(mockCsr); + mockCsr->useNewResourceImplicitFlush = false; + mockCsr->useGpuIdleImplicitFlush = false; + mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch); + + auto mockedSubmissionsAggregator = new mockSubmissionsAggregator(); + mockCsr->overrideSubmissionAggregator(mockedSubmissionsAggregator); + + configureCSRtoNonDirtyState(true); + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + dispatchFlags.guardCommandBufferWithPipeControl = true; + dispatchFlags.requiresCoherency = true; + + mockCsr->lastSentCoherencyRequest = 1; + + commandStream.getSpace(4); + + mockCsr->flushTask(commandStream, + 4, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + + EXPECT_EQ(0, mockCsr->flushCalledCount); + + auto &surfacesForResidency = mockCsr->getResidencyAllocations(); + EXPECT_EQ(0u, surfacesForResidency.size()); + + auto &cmdBufferList = 
mockedSubmissionsAggregator->peekCommandBuffers(); + EXPECT_FALSE(cmdBufferList.peekIsEmpty()); + auto cmdBuffer = cmdBufferList.peekHead(); + + //preemption allocation + sip kernel + size_t csrSurfaceCount = (pDevice->getPreemptionMode() == PreemptionMode::MidThread) ? 2 : 0; + csrSurfaceCount += mockCsr->globalFenceAllocation ? 1 : 0; + csrSurfaceCount += mockCsr->clearColorAllocation ? 1 : 0; + + EXPECT_EQ(4u + csrSurfaceCount, cmdBuffer->surfaces.size()); + + //copy those surfaces + std::vector residentSurfaces = cmdBuffer->surfaces; + + for (auto &graphicsAllocation : residentSurfaces) { + EXPECT_TRUE(graphicsAllocation->isResident(mockCsr->getOsContext().getContextId())); + EXPECT_EQ(1u, graphicsAllocation->getResidencyTaskCount(mockCsr->getOsContext().getContextId())); + } + + mockCsr->flushBatchedSubmissions(); + + EXPECT_FALSE(mockCsr->recordedCommandBuffer->batchBuffer.low_priority); + EXPECT_TRUE(mockCsr->recordedCommandBuffer->batchBuffer.requiresCoherency); + EXPECT_EQ(mockCsr->recordedCommandBuffer->batchBuffer.commandBufferAllocation, commandStream.getGraphicsAllocation()); + EXPECT_EQ(4u, mockCsr->recordedCommandBuffer->batchBuffer.startOffset); + EXPECT_EQ(1, mockCsr->flushCalledCount); + + EXPECT_TRUE(mockedSubmissionsAggregator->peekCommandBuffers().peekIsEmpty()); + + EXPECT_EQ(0u, surfacesForResidency.size()); + + for (auto &graphicsAllocation : residentSurfaces) { + EXPECT_FALSE(graphicsAllocation->isResident(mockCsr->getOsContext().getContextId())); + } +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenNothingToFlushWhenFlushTaskCalledThenDontFlushStamp) { + auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(mockCsr); + + configureCSRtoNonDirtyState(true); + + EXPECT_EQ(0, mockCsr->flushCalledCount); + auto previousFlushStamp = mockCsr->flushStamp->peekStamp(); + auto cmplStamp = 
flushTask(*mockCsr); + EXPECT_EQ(mockCsr->flushStamp->peekStamp(), previousFlushStamp); + EXPECT_EQ(previousFlushStamp, cmplStamp.flushStamp); + EXPECT_EQ(0, mockCsr->flushCalledCount); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPPlusTests, givenEpilogueRequiredFlagWhenTaskIsSubmittedDirectlyThenItPointsBackToCsr) { + configureCSRtoNonDirtyState(true); + auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver(); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + + EXPECT_EQ(0u, commandStreamReceiver.getCmdSizeForEpilogue(dispatchFlags)); + + dispatchFlags.epilogueRequired = true; + dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + + EXPECT_EQ(MemoryConstants::cacheLineSize, commandStreamReceiver.getCmdSizeForEpilogue(dispatchFlags)); + + auto data = commandStream.getSpace(MemoryConstants::cacheLineSize); + memset(data, 0, MemoryConstants::cacheLineSize); + commandStreamReceiver.storeMakeResidentAllocations = true; + commandStreamReceiver.flushTask(commandStream, + 0, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + auto &commandStreamReceiverStream = commandStreamReceiver.getCS(0u); + + EXPECT_EQ(MemoryConstants::cacheLineSize * 2, commandStream.getUsed()); + EXPECT_EQ(MemoryConstants::cacheLineSize, commandStreamReceiverStream.getUsed()); + + parseCommands(commandStream, 0); + + auto itBBend = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(itBBend, cmdList.end()); + + auto itBatchBufferStart = find(cmdList.begin(), cmdList.end()); + EXPECT_NE(itBatchBufferStart, cmdList.end()); + + auto batchBufferStart = genCmdCast(*itBatchBufferStart); + EXPECT_EQ(batchBufferStart->getBatchBufferStartAddressGraphicsaddress472(), commandStreamReceiverStream.getGraphicsAllocation()->getGpuAddress()); + + parseCommands(commandStreamReceiverStream, 0); + + itBBend = find(cmdList.begin(), cmdList.end()); + void *bbEndAddress = 
*itBBend; + + EXPECT_EQ(commandStreamReceiverStream.getCpuBase(), bbEndAddress); + + EXPECT_TRUE(commandStreamReceiver.isMadeResident(commandStreamReceiverStream.getGraphicsAllocation())); +} diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_plus.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_plus.cpp new file mode 100644 index 0000000000..3ef23b7a25 --- /dev/null +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_plus.cpp @@ -0,0 +1,852 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_stream/command_stream_receiver.h" +#include "shared/source/command_stream/linear_stream.h" +#include "shared/source/command_stream/preemption.h" +#include "shared/source/command_stream/scratch_space_controller.h" +#include "shared/source/command_stream/scratch_space_controller_xehp_plus.h" +#include "shared/source/gmm_helper/gmm.h" +#include "shared/source/gmm_helper/gmm_helper.h" +#include "shared/source/helpers/blit_commands_helper.h" +#include "shared/source/helpers/hw_helper.h" +#include "shared/source/helpers/timestamp_packet.h" +#include "shared/source/os_interface/os_interface.h" +#include "shared/test/common/cmd_parse/hw_parse.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/helpers/variable_backup.h" +#include "shared/test/common/mocks/ult_device_factory.h" +#include "shared/test/unit_test/utilities/base_object_utils.h" + +#include "opencl/source/command_queue/command_queue_hw.h" +#include "opencl/source/command_queue/resource_barrier.h" +#include "opencl/source/mem_obj/buffer.h" +#include "opencl/test/unit_test/fixtures/cl_device_fixture.h" +#include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_context.h" +#include 
"opencl/test/unit_test/mocks/mock_csr.h" +#include "opencl/test/unit_test/mocks/mock_event.h" +#include "opencl/test/unit_test/mocks/mock_kernel.h" +#include "opencl/test/unit_test/mocks/mock_memory_manager.h" +#include "opencl/test/unit_test/mocks/mock_platform.h" +#include "opencl/test/unit_test/mocks/mock_scratch_space_controller_xehp_plus.h" +#include "opencl/test/unit_test/mocks/mock_timestamp_container.h" +#include "test.h" + +#include "gtest/gtest.h" +#include "reg_configs_common.h" + +using namespace NEO; + +namespace NEO { +template +class ImplicitFlushSettings { + public: + static bool &getSettingForNewResource(); + static bool &getSettingForGpuIdle(); + + private: + static bool defaultSettingForNewResource; + static bool defaultSettingForGpuIdle; +}; +} // namespace NEO + +struct CommandStreamReceiverHwTestXeHPPlus : public ClDeviceFixture, + public HardwareParse, + public ::testing::Test { + + void SetUp() override { + ClDeviceFixture::SetUp(); + HardwareParse::SetUp(); + } + + void TearDown() override { + HardwareParse::TearDown(); + ClDeviceFixture::TearDown(); + } +}; + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenPreambleSentWhenL3ConfigRequestChangedThenDontProgramL3Register) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + size_t GWS = 1; + MockContext ctx(pClDevice); + MockKernelWithInternals kernel(*pClDevice); + CommandQueueHw commandQueue(&ctx, pClDevice, 0, false); + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + auto &commandStreamCSR = commandStreamReceiver->getCS(); + + PreemptionMode initialPreemptionMode = commandStreamReceiver->lastPreemptionMode; + PreemptionMode devicePreemptionMode = pDevice->getPreemptionMode(); + + commandStreamReceiver->isPreambleSent = true; + commandStreamReceiver->lastSentL3Config = 0; + + 
commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr); + + parseCommands(commandStreamCSR, 0); + auto itorCmd = find(cmdList.begin(), cmdList.end()); + if (PreemptionHelper::getRequiredCmdStreamSize(initialPreemptionMode, devicePreemptionMode) > 0u) { + ASSERT_NE(cmdList.end(), itorCmd); + } else { + EXPECT_EQ(cmdList.end(), itorCmd); + } +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, WhenCommandStreamReceiverHwIsCreatedThenDefaultSshSizeIs2MB) { + auto &commandStreamReceiver = pDevice->getGpgpuCommandStreamReceiver(); + EXPECT_EQ(2 * MB, commandStreamReceiver.defaultSshSize); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, WhenScratchSpaceExistsThenReturnNonZeroGpuAddressToPatch) { + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + void *ssh = alignedMalloc(512, 4096); + + uint32_t perThreadScratchSize = 0x400; + + bool stateBaseAddressDirty = false; + bool cfeStateDirty = false; + commandStreamReceiver->getScratchSpaceController()->setRequiredScratchSpace(ssh, 0u, perThreadScratchSize, 0u, 0u, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + ASSERT_NE(nullptr, commandStreamReceiver->getScratchAllocation()); + EXPECT_TRUE(cfeStateDirty); + + auto scratchSpaceAddr = commandStreamReceiver->getScratchPatchAddress(); + constexpr uint64_t notExpectedScratchGpuAddr = 0; + EXPECT_NE(notExpectedScratchGpuAddr, scratchSpaceAddr); + alignedFree(ssh); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, WhenOsContextSupportsMultipleDevicesThenScratchSpaceAllocationIsPlacedOnEachSupportedDevice) { + DebugManagerStateRestore restorer; + DebugManager.flags.CreateMultipleSubDevices.set(2u); + ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment(); + 
executionEnvironment->memoryManager.reset(new MockMemoryManager(false, true, *executionEnvironment)); + uint32_t tileMask = 0b11; + std::unique_ptr osContext(OsContext::create(nullptr, 0u, tileMask, EngineTypeUsage{aub_stream::ENGINE_CCS, EngineUsage::Regular}, PreemptionMode::MidThread, + false)); + auto commandStreamReceiver = std::make_unique>(*executionEnvironment, 0, tileMask); + initPlatform(); + + void *ssh = alignedMalloc(512, 4096); + + uint32_t perThreadScratchSize = 0x400; + + bool stateBaseAddressDirty = false; + bool cfeStateDirty = false; + commandStreamReceiver->getScratchSpaceController()->setRequiredScratchSpace(ssh, 0u, perThreadScratchSize, 0u, 0u, *osContext, stateBaseAddressDirty, cfeStateDirty); + auto allocation = commandStreamReceiver->getScratchAllocation(); + EXPECT_EQ(tileMask, static_cast(allocation->storageInfo.memoryBanks.to_ulong())); + alignedFree(ssh); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, WhenScratchSpaceNotExistThenReturnZeroGpuAddressToPatch) { + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + auto scratchSpaceAddr = commandStreamReceiver.getScratchPatchAddress(); + constexpr uint64_t expectedScratchGpuAddr = 0; + EXPECT_EQ(expectedScratchGpuAddr, scratchSpaceAddr); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, whenProgrammingMiSemaphoreWaitThenSetRegisterPollModeMemoryPoll) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + MI_SEMAPHORE_WAIT miSemaphoreWait = FamilyType::cmdInitMiSemaphoreWait; + EXPECT_EQ(MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_MEMORY_POLL, miSemaphoreWait.getRegisterPollMode()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenSratchAllocationRequestedThenProgramCfeStateWithScratchAllocation) { + using CFE_STATE = typename FamilyType::CFE_STATE; + using 
RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + + const HardwareInfo &hwInfo = *defaultHwInfo; + size_t GWS = 1; + MockContext ctx(pClDevice); + MockKernelWithInternals kernel(*pClDevice); + CommandQueueHw commandQueue(&ctx, pClDevice, 0, false); + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver->getScratchSpaceController()); + scratchController->slotId = 2u; + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + auto &commandStreamCSR = commandStreamReceiver->getCS(); + + kernel.kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0] = 0x1000; + auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); + uint32_t computeUnits = hwHelper.getComputeUnitsUsedForScratch(&hwInfo); + size_t scratchSpaceSize = kernel.kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0] * computeUnits; + + commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr); + commandQueue.flush(); + + parseCommands(commandStreamCSR, 0); + findHardwareCommands(); + + EXPECT_EQ(kernel.kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0], commandStreamReceiver->requiredScratchSize); + EXPECT_EQ(scratchSpaceSize, scratchController->scratchSizeBytes); + EXPECT_EQ(scratchSpaceSize, scratchController->getScratchSpaceAllocation()->getUnderlyingBufferSize()); + ASSERT_NE(nullptr, cmdMediaVfeState); + auto cfeState = static_cast(cmdMediaVfeState); + uint32_t bufferOffset = static_cast(scratchController->slotId * scratchController->singleSurfaceStateSize * 2); + EXPECT_EQ(bufferOffset, cfeState->getScratchSpaceBuffer()); + RENDER_SURFACE_STATE *scratchState = reinterpret_cast(scratchController->surfaceStateHeap + bufferOffset); + EXPECT_EQ(scratchController->scratchAllocation->getGpuAddress(), scratchState->getSurfaceBaseAddress()); + 
EXPECT_EQ(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_SCRATCH, scratchState->getSurfaceType()); + + SURFACE_STATE_BUFFER_LENGTH length = {0}; + length.Length = static_cast(computeUnits - 1); + EXPECT_EQ(length.SurfaceState.Depth + 1u, scratchState->getDepth()); + EXPECT_EQ(length.SurfaceState.Width + 1u, scratchState->getWidth()); + EXPECT_EQ(length.SurfaceState.Height + 1u, scratchState->getHeight()); + EXPECT_EQ(kernel.kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0], scratchState->getSurfacePitch()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenNewSshProvidedAndNoScratchAllocationExistThenNoDirtyBitSet) { + auto commandStreamReceiver = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver->getScratchSpaceController()); + + bool stateBaseAddressDirty = false; + bool cfeStateDirty = false; + scratchController->surfaceStateHeap = reinterpret_cast(0x1000); + scratchController->setRequiredScratchSpace(reinterpret_cast(0x2000), 0u, 0u, 0u, 0u, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_EQ(scratchController->surfaceStateHeap, reinterpret_cast(0x2000)); + EXPECT_FALSE(cfeStateDirty); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenRequiredScratchSpaceIsSetThenPerThreadScratchSizeIsAlignedTo64) { + auto commandStreamReceiver = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver->getScratchSpaceController()); + + uint32_t perThreadScratchSize = 1; + uint32_t expectedValue = 1 << 6; + bool stateBaseAddressDirty = false; + bool cfeStateDirty = false; + uint8_t surfaceHeap[1000]; + scratchController->setRequiredScratchSpace(surfaceHeap, 0u, 
perThreadScratchSize, 0u, commandStreamReceiver->taskCount, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_EQ(expectedValue, scratchController->perThreadScratchSize); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenNewSshProvidedAndScratchAllocationExistsThenSetDirtyBitCopyCurrentState) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + auto commandStreamReceiver = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver->getScratchSpaceController()); + scratchController->slotId = 0; + bool stateBaseAddressDirty = false; + bool cfeStateDirty = false; + + void *oldSurfaceHeap = alignedMalloc(0x1000, 0x1000); + scratchController->setRequiredScratchSpace(oldSurfaceHeap, 0u, 0x1000u, 0u, commandStreamReceiver->taskCount, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + EXPECT_EQ(1u, scratchController->slotId); + EXPECT_EQ(scratchController->surfaceStateHeap, oldSurfaceHeap); + char *surfaceStateBuf = static_cast(oldSurfaceHeap) + scratchController->slotId * sizeof(RENDER_SURFACE_STATE) * 2; + GraphicsAllocation *scratchAllocation = scratchController->scratchAllocation; + RENDER_SURFACE_STATE *surfaceState = reinterpret_cast(surfaceStateBuf); + EXPECT_EQ(scratchController->scratchAllocation->getGpuAddress(), surfaceState->getSurfaceBaseAddress()); + EXPECT_EQ(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_SCRATCH, surfaceState->getSurfaceType()); + + void *newSurfaceHeap = alignedMalloc(0x1000, 0x1000); + scratchController->setRequiredScratchSpace(newSurfaceHeap, 0u, 0x1000u, 0u, commandStreamReceiver->taskCount, *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + EXPECT_EQ(1u, scratchController->slotId); + 
EXPECT_EQ(scratchController->surfaceStateHeap, newSurfaceHeap); + EXPECT_EQ(scratchAllocation, scratchController->scratchAllocation); + surfaceStateBuf = static_cast(newSurfaceHeap) + scratchController->slotId * sizeof(RENDER_SURFACE_STATE) * 2; + surfaceState = reinterpret_cast(surfaceStateBuf); + EXPECT_EQ(scratchController->scratchAllocation->getGpuAddress(), surfaceState->getSurfaceBaseAddress()); + EXPECT_EQ(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_SCRATCH, surfaceState->getSurfaceType()); + + alignedFree(oldSurfaceHeap); + alignedFree(newSurfaceHeap); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenBiggerScratchSpaceRequiredThenReplaceAllocation) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver->getScratchSpaceController()); + scratchController->slotId = 6; + + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + void *surfaceHeap = alignedMalloc(0x1000, 0x1000); + scratchController->setRequiredScratchSpace(surfaceHeap, 0u, 0x1000u, 0u, commandStreamReceiver->taskCount, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + EXPECT_EQ(7u, scratchController->slotId); + uint64_t offset = static_cast(scratchController->slotId * sizeof(RENDER_SURFACE_STATE) * 2); + EXPECT_EQ(offset, scratchController->getScratchPatchAddress()); + EXPECT_EQ(0u, scratchController->calculateNewGSH()); + uint64_t gpuVa = scratchController->scratchAllocation->getGpuAddress(); + char *surfaceStateBuf = static_cast(scratchController->surfaceStateHeap) + offset; + RENDER_SURFACE_STATE *surfaceState = reinterpret_cast(surfaceStateBuf); + EXPECT_EQ(gpuVa, 
surfaceState->getSurfaceBaseAddress()); + + scratchController->setRequiredScratchSpace(surfaceHeap, 0u, 0x2000u, 0u, commandStreamReceiver->taskCount, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + EXPECT_EQ(8u, scratchController->slotId); + offset = static_cast(scratchController->slotId * sizeof(RENDER_SURFACE_STATE) * 2); + EXPECT_EQ(offset, scratchController->getScratchPatchAddress()); + EXPECT_NE(gpuVa, scratchController->scratchAllocation->getGpuAddress()); + gpuVa = scratchController->scratchAllocation->getGpuAddress(); + surfaceStateBuf = static_cast(scratchController->surfaceStateHeap) + offset; + surfaceState = reinterpret_cast(surfaceStateBuf); + EXPECT_EQ(gpuVa, surfaceState->getSurfaceBaseAddress()); + + alignedFree(surfaceHeap); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenScratchSlotIsNonZeroThenSlotIdIsUpdatedAndCorrectOffsetIsSet) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver->getScratchSpaceController()); + + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + void *surfaceHeap = alignedMalloc(0x1000, 0x1000); + scratchController->setRequiredScratchSpace(surfaceHeap, 1u, 0x1000u, 0u, commandStreamReceiver->taskCount, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + EXPECT_EQ(1u, scratchController->slotId); + EXPECT_TRUE(scratchController->updateSlots); + uint64_t offset = static_cast(scratchController->slotId * sizeof(RENDER_SURFACE_STATE) * 2); + EXPECT_EQ(offset, scratchController->getScratchPatchAddress()); + EXPECT_EQ(0u, 
scratchController->calculateNewGSH()); + uint64_t gpuVa = scratchController->scratchAllocation->getGpuAddress(); + char *surfaceStateBuf = static_cast(scratchController->surfaceStateHeap) + offset; + RENDER_SURFACE_STATE *surfaceState = reinterpret_cast(surfaceStateBuf); + EXPECT_EQ(gpuVa, surfaceState->getSurfaceBaseAddress()); + alignedFree(surfaceHeap); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenProgramHeapsThenSetReqScratchSpaceAndProgramSurfaceStateAreCalled) { + class MockScratchSpaceControllerXeHPPlus : public ScratchSpaceControllerXeHPPlus { + public: + uint32_t requiredScratchSpaceCalledTimes = 0u; + uint32_t programSurfaceStateCalledTimes = 0u; + MockScratchSpaceControllerXeHPPlus(uint32_t rootDeviceIndex, + ExecutionEnvironment &environment, + InternalAllocationStorage &allocationStorage) : ScratchSpaceControllerXeHPPlus(rootDeviceIndex, environment, allocationStorage) {} + + using ScratchSpaceControllerXeHPPlus::scratchAllocation; + + void setRequiredScratchSpace(void *sshBaseAddress, + uint32_t scratchSlot, + uint32_t requiredPerThreadScratchSize, + uint32_t requiredPerThreadPrivateScratchSize, + uint32_t currentTaskCount, + OsContext &osContext, + bool &stateBaseAddressDirty, + bool &vfeStateDirty) override { + requiredScratchSpaceCalledTimes++; + } + + protected: + void programSurfaceState() override { + programSurfaceStateCalledTimes++; + }; + }; + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + std::unique_ptr scratchController = std::make_unique(pDevice->getRootDeviceIndex(), + *pDevice->executionEnvironment, + *commandStreamReceiver->getInternalAllocationStorage()); + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + void *surfaceHeap = alignedMalloc(0x1000, 0x1000); + NEO::GraphicsAllocation heap1(1u, 
NEO::GraphicsAllocation::AllocationType::BUFFER, surfaceHeap, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + NEO::GraphicsAllocation heap2(1u, NEO::GraphicsAllocation::AllocationType::BUFFER, surfaceHeap, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + NEO::GraphicsAllocation heap3(1u, NEO::GraphicsAllocation::AllocationType::BUFFER, surfaceHeap, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + HeapContainer container; + + container.push_back(&heap1); + container.push_back(&heap2); + container.push_back(&heap3); + + scratchController->programHeaps(container, 0u, 1u, 0u, 0u, commandStreamReceiver->getOsContext(), stateBaseAddressDirty, cfeStateDirty); + + auto scratch = static_cast(scratchController.get()); + EXPECT_EQ(scratch->requiredScratchSpaceCalledTimes, 1u); + EXPECT_EQ(scratch->programSurfaceStateCalledTimes, 2u); + + alignedFree(surfaceHeap); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchWhenSetNewSshPtrAndChangeIdIsFalseThenSlotIdIsNotChanged) { + class MockScratchSpaceControllerXeHPPlus : public ScratchSpaceControllerXeHPPlus { + public: + uint32_t programSurfaceStateCalledTimes = 0u; + MockScratchSpaceControllerXeHPPlus(uint32_t rootDeviceIndex, + ExecutionEnvironment &environment, + InternalAllocationStorage &allocationStorage) : ScratchSpaceControllerXeHPPlus(rootDeviceIndex, environment, allocationStorage) {} + + using ScratchSpaceControllerXeHPPlus::scratchAllocation; + using ScratchSpaceControllerXeHPPlus::slotId; + + protected: + void programSurfaceState() override { + programSurfaceStateCalledTimes++; + }; + }; + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + std::unique_ptr scratchController = std::make_unique(pDevice->getRootDeviceIndex(), + *pDevice->executionEnvironment, + *commandStreamReceiver->getInternalAllocationStorage()); + + NEO::GraphicsAllocation 
graphicsAllocation(1u, NEO::GraphicsAllocation::AllocationType::BUFFER, nullptr, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + + bool cfeStateDirty = false; + + void *surfaceHeap = alignedMalloc(0x1000, 0x1000); + + auto scratch = static_cast(scratchController.get()); + scratch->slotId = 10; + scratch->scratchAllocation = &graphicsAllocation; + scratch->setNewSshPtr(surfaceHeap, cfeStateDirty, false); + scratch->scratchAllocation = nullptr; + EXPECT_EQ(10u, scratch->slotId); + EXPECT_EQ(scratch->programSurfaceStateCalledTimes, 1u); + EXPECT_TRUE(cfeStateDirty); + + alignedFree(surfaceHeap); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchWhenProgramSurfaceStateAndUpdateSlotIsFalseThenSlotIdIsNotChanged) { + class MockScratchSpaceControllerXeHPPlus : public ScratchSpaceControllerXeHPPlus { + public: + MockScratchSpaceControllerXeHPPlus(uint32_t rootDeviceIndex, + ExecutionEnvironment &environment, + InternalAllocationStorage &allocationStorage) : ScratchSpaceControllerXeHPPlus(rootDeviceIndex, environment, allocationStorage) {} + + using ScratchSpaceControllerXeHPPlus::programSurfaceState; + using ScratchSpaceControllerXeHPPlus::scratchAllocation; + using ScratchSpaceControllerXeHPPlus::slotId; + using ScratchSpaceControllerXeHPPlus::surfaceStateHeap; + using ScratchSpaceControllerXeHPPlus::updateSlots; + }; + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + std::unique_ptr scratchController = std::make_unique(pDevice->getRootDeviceIndex(), + *pDevice->executionEnvironment, + *commandStreamReceiver->getInternalAllocationStorage()); + + NEO::GraphicsAllocation graphicsAllocation(1u, NEO::GraphicsAllocation::AllocationType::BUFFER, nullptr, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + + void *surfaceHeap = alignedMalloc(0x1000, 0x1000); + + auto scratch = 
static_cast(scratchController.get()); + scratch->surfaceStateHeap = static_cast(surfaceHeap); + scratch->slotId = 10; + scratch->updateSlots = false; + scratch->scratchAllocation = &graphicsAllocation; + scratch->programSurfaceState(); + scratch->scratchAllocation = nullptr; + EXPECT_EQ(10u, scratch->slotId); + + alignedFree(surfaceHeap); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenBiggerPrivateScratchSpaceRequiredThenReplaceAllocation) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(1); + RENDER_SURFACE_STATE surfaceState[6]; + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(pDevice->getGpgpuCommandStreamReceiver().getOsContext()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + uint32_t sizeForPrivateScratch = MemoryConstants::pageSize; + + scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, sizeForPrivateScratch, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + uint64_t gpuVa = scratchController->privateScratchAllocation->getGpuAddress(); + EXPECT_EQ(gpuVa, surfaceState[3].getSurfaceBaseAddress()); + + scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, sizeForPrivateScratch * 2, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + + EXPECT_NE(gpuVa, scratchController->privateScratchAllocation->getGpuAddress()); + EXPECT_EQ(scratchController->privateScratchAllocation->getGpuAddress(), surfaceState[5].getSurfaceBaseAddress()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, 
givenScratchSpaceControllerWithOnlyPrivateScratchSpaceWhenGettingPatchAddressThenGetCorrectValue) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(1); + RENDER_SURFACE_STATE surfaceState[6]; + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(pDevice->getGpgpuCommandStreamReceiver().getOsContext()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + uint32_t sizeForPrivateScratch = MemoryConstants::pageSize; + + EXPECT_EQ(nullptr, scratchController->getScratchSpaceAllocation()); + EXPECT_EQ(nullptr, scratchController->getPrivateScratchSpaceAllocation()); + + EXPECT_EQ(0u, scratchController->getScratchPatchAddress()); + + scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, sizeForPrivateScratch, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + auto expectedPatchAddress = 2 * sizeof(RENDER_SURFACE_STATE); + EXPECT_EQ(nullptr, scratchController->getScratchSpaceAllocation()); + EXPECT_NE(nullptr, scratchController->getPrivateScratchSpaceAllocation()); + + EXPECT_EQ(expectedPatchAddress, scratchController->getScratchPatchAddress()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenNotBiggerPrivateScratchSpaceRequiredThenCfeStateIsNotDirty) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(1); + RENDER_SURFACE_STATE surfaceState[4]; + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + 
commandStreamReceiver.setupContext(pDevice->getGpgpuCommandStreamReceiver().getOsContext()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + uint32_t sizeForPrivateScratch = MemoryConstants::pageSize; + + scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, sizeForPrivateScratch, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + uint64_t gpuVa = scratchController->privateScratchAllocation->getGpuAddress(); + cfeStateDirty = false; + + scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, sizeForPrivateScratch, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_FALSE(cfeStateDirty); + + EXPECT_EQ(gpuVa, scratchController->privateScratchAllocation->getGpuAddress()); + EXPECT_EQ(gpuVa, surfaceState[3].getSurfaceBaseAddress()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateWithoutPrivateScratchSpaceWhenDoubleAllocationsScratchSpaceIsUsedThenPrivateScratchAddressIsZero) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(1); + RENDER_SURFACE_STATE surfaceState[4]; + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(pDevice->getGpgpuCommandStreamReceiver().getOsContext()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + + uint32_t sizeForScratch = MemoryConstants::pageSize; + + scratchController->setRequiredScratchSpace(surfaceState, 0u, sizeForScratch, 0u, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, 
cfeStateDirty); + EXPECT_TRUE(cfeStateDirty); + EXPECT_EQ(nullptr, scratchController->privateScratchAllocation); + + EXPECT_EQ(0u, surfaceState[3].getSurfaceBaseAddress()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceControllerWhenDebugKeyForPrivateScratchIsDisabledThenThereAre16Slots) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(0); + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(pDevice->getGpgpuCommandStreamReceiver().getOsContext()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + EXPECT_EQ(16u, scratchController->stateSlotsCount); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceControllerWhenDebugKeyForPrivateScratchIsEnabledThenThereAre32Slots) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(1); + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(pDevice->getGpgpuCommandStreamReceiver().getOsContext()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + EXPECT_EQ(32u, scratchController->stateSlotsCount); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenScratchSpaceSurfaceStateEnabledWhenSizeForPrivateScratchSpaceIsMisalignedThenAlignItTo64) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(1); + RENDER_SURFACE_STATE surfaceState[4]; + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = 
static_cast(commandStreamReceiver.getScratchSpaceController()); + + uint32_t misalignedSizeForPrivateScratch = MemoryConstants::pageSize + 1; + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + scratchController->setRequiredScratchSpace(surfaceState, 0u, 0u, misalignedSizeForPrivateScratch, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_NE(scratchController->privateScratchSizeBytes, misalignedSizeForPrivateScratch * scratchController->computeUnitsUsedForScratch); + EXPECT_EQ(scratchController->privateScratchSizeBytes, alignUp(misalignedSizeForPrivateScratch, 64) * scratchController->computeUnitsUsedForScratch); + EXPECT_EQ(scratchController->privateScratchSizeBytes, scratchController->getPrivateScratchSpaceAllocation()->getUnderlyingBufferSize()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenDisabledPrivateScratchSpaceWhenSizeForPrivateScratchSpaceIsProvidedThenItIsNotCreated) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(0); + RENDER_SURFACE_STATE surfaceState[4]; + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + + bool cfeStateDirty = false; + bool stateBaseAddressDirty = false; + scratchController->setRequiredScratchSpace(surfaceState, 0u, MemoryConstants::pageSize, MemoryConstants::pageSize, 0u, + *pDevice->getDefaultEngine().osContext, stateBaseAddressDirty, cfeStateDirty); + EXPECT_EQ(0u, scratchController->privateScratchSizeBytes); + EXPECT_EQ(nullptr, scratchController->getPrivateScratchSpaceAllocation()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenDisabledPrivateScratchSpaceWhenGettingOffsetForSlotThenEachSlotContainsOnlyOneSurfaceState) { + 
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + DebugManagerStateRestore restorer; + DebugManager.flags.EnablePrivateScratchSlot1.set(0); + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto scratchController = static_cast(commandStreamReceiver.getScratchSpaceController()); + EXPECT_EQ(sizeof(RENDER_SURFACE_STATE), scratchController->getOffsetToSurfaceState(1u)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenBlockedCacheFlushCmdWhenSubmittingThenDispatchBlockedCommands) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + MockContext context(pClDevice); + + auto mockCsr = new MockCsrHw2(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(mockCsr); + mockCsr->timestampPacketWriteEnabled = true; + mockCsr->storeFlushedTaskStream = true; + + auto cmdQ0 = clUniquePtr(new MockCommandQueueHw(&context, pClDevice, nullptr)); + + auto &secondEngine = pDevice->getEngine(pDevice->getHardwareInfo().capabilityTable.defaultEngineType, EngineUsage::LowPriority); + static_cast *>(secondEngine.commandStreamReceiver)->timestampPacketWriteEnabled = true; + + auto cmdQ1 = clUniquePtr(new MockCommandQueueHw(&context, pClDevice, nullptr)); + cmdQ1->gpgpuEngine = &secondEngine; + cmdQ1->timestampPacketContainer = std::make_unique(); + EXPECT_NE(&cmdQ0->getGpgpuCommandStreamReceiver(), &cmdQ1->getGpgpuCommandStreamReceiver()); + + MockTimestampPacketContainer node0(*pDevice->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer node1(*pDevice->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event0(cmdQ0.get(), 0, 0, 0); // on the same CSR + event0.addTimestampPacketNodes(node0); + Event event1(cmdQ1.get(), 0, 0, 0); // on different CSR + event1.addTimestampPacketNodes(node1); + 
+ uint32_t numEventsOnWaitlist = 3; + + UserEvent userEvent; + cl_event waitlist[] = {&event0, &event1, &userEvent}; + + cl_int retVal = CL_SUCCESS; + auto buffer = clUniquePtr(Buffer::create(&context, 0, MemoryConstants::pageSize, nullptr, retVal)); + cl_resource_barrier_descriptor_intel descriptor = {}; + descriptor.mem_object = buffer.get(); + BarrierCommand barrierCommand(cmdQ0.get(), &descriptor, 1); + + cmdQ0->enqueueResourceBarrier(&barrierCommand, numEventsOnWaitlist, waitlist, nullptr); + + userEvent.setStatus(CL_COMPLETE); + + HardwareParse hwParserCsr; + HardwareParse hwParserCmdQ; + LinearStream taskStream(mockCsr->storedTaskStream.get(), mockCsr->storedTaskStreamSize); + taskStream.getSpace(mockCsr->storedTaskStreamSize); + hwParserCsr.parseCommands(mockCsr->commandStream, 0); + hwParserCmdQ.parseCommands(taskStream, 0); + + { + auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); + auto expectedQueueSemaphoresCount = 1u; + if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(pDevice->getHardwareInfo())) { + expectedQueueSemaphoresCount += 2; + } + EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); + auto semaphoreCmd = genCmdCast(*(queueSemaphores[0])); + EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + + auto dataAddress = TimestampPacketHelper::getContextEndGpuAddress(*node0.getNode(0)); + EXPECT_EQ(dataAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); + } + { + auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); + EXPECT_EQ(1u, csrSemaphores.size()); + auto semaphoreCmd = genCmdCast(*(csrSemaphores[0])); + EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + + auto dataAddress = 
TimestampPacketHelper::getContextEndGpuAddress(*node1.getNode(0)); + + EXPECT_EQ(dataAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); + } + + EXPECT_TRUE(mockCsr->passedDispatchFlags.blocking); + EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl); + EXPECT_EQ(pDevice->getPreemptionMode(), mockCsr->passedDispatchFlags.preemptionMode); + + cmdQ0->isQueueBlocked(); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, WhenOsContextSupportsMultipleDevicesThenCommandStreamReceiverIsMultiOsContextCapable) { + uint32_t multiDeviceMask = 0b11; + uint32_t singleDeviceMask = 0b10; + std::unique_ptr multiDeviceOsContext(OsContext::create(nullptr, 0u, multiDeviceMask, EngineTypeUsage{aub_stream::ENGINE_RCS, EngineUsage::Regular}, PreemptionMode::MidThread, + false)); + std::unique_ptr singleDeviceOsContext(OsContext::create(nullptr, 0u, singleDeviceMask, EngineTypeUsage{aub_stream::ENGINE_RCS, EngineUsage::Regular}, PreemptionMode::MidThread, + false)); + + EXPECT_EQ(2u, multiDeviceOsContext->getNumSupportedDevices()); + EXPECT_EQ(1u, singleDeviceOsContext->getNumSupportedDevices()); + + UltCommandStreamReceiver commandStreamReceiverMulti(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), multiDeviceMask); + commandStreamReceiverMulti.callBaseIsMultiOsContextCapable = true; + EXPECT_TRUE(commandStreamReceiverMulti.isMultiOsContextCapable()); + EXPECT_EQ(2u, commandStreamReceiverMulti.deviceBitfield.count()); + + UltCommandStreamReceiver commandStreamReceiverSingle(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex(), singleDeviceMask); + commandStreamReceiverSingle.callBaseIsMultiOsContextCapable = true; + EXPECT_FALSE(commandStreamReceiverSingle.isMultiOsContextCapable()); + EXPECT_EQ(1u, commandStreamReceiverSingle.deviceBitfield.count()); +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPPlus, givenXE_HP_COREDefaultSupportEnabledWhenOsSupportsNewResourceImplicitFlushThenReturnOsSupportValue, 
IsXeHpCore) { + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + + EXPECT_TRUE(ImplicitFlushSettings::getSettingForNewResource()); + + VariableBackup defaultSettingForNewResourceBackup(&ImplicitFlushSettings::getSettingForNewResource(), true); + + if (commandStreamReceiver.getOSInterface()->newResourceImplicitFlush) { + EXPECT_TRUE(commandStreamReceiver.checkPlatformSupportsNewResourceImplicitFlush()); + } else { + EXPECT_FALSE(commandStreamReceiver.checkPlatformSupportsNewResourceImplicitFlush()); + } +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPPlus, givenXE_HP_COREDefaultSupportDisabledWhenOsSupportsNewResourceImplicitFlushThenReturnOsSupportValue, IsXeHpCore) { + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + + VariableBackup defaultSettingForNewResourceBackup(&ImplicitFlushSettings::getSettingForNewResource(), false); + + EXPECT_FALSE(commandStreamReceiver.checkPlatformSupportsNewResourceImplicitFlush()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenPlatformSupportsImplicitFlushForNewResourceWhenCsrIsMultiContextThenExpectNoSupport) { + VariableBackup defaultSettingForNewResourceBackup(&ImplicitFlushSettings::getSettingForNewResource(), true); + + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + commandStreamReceiver.multiOsContextCapable = true; + + EXPECT_TRUE(ImplicitFlushSettings::getSettingForNewResource()); + EXPECT_FALSE(commandStreamReceiver.checkPlatformSupportsNewResourceImplicitFlush()); +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPPlus, givenXE_HP_COREDefaultSupportEnabledWhenOsSupportsGpuIdleImplicitFlushThenReturnOsSupportValue, 
IsXeHpCore) { + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + + EXPECT_TRUE(ImplicitFlushSettings::getSettingForGpuIdle()); + + VariableBackup defaultSettingForGpuIdleBackup(&ImplicitFlushSettings::getSettingForGpuIdle(), true); + + if (commandStreamReceiver.getOSInterface()->gpuIdleImplicitFlush) { /* this test asserts on the GPU-idle check, so branch on the GPU-idle OS flag rather than the new-resource one */ + EXPECT_TRUE(commandStreamReceiver.checkPlatformSupportsGpuIdleImplicitFlush()); + } else { + EXPECT_FALSE(commandStreamReceiver.checkPlatformSupportsGpuIdleImplicitFlush()); + } +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPPlus, givenXE_HP_COREDefaultSupportDisabledWhenOsSupportsGpuIdleImplicitFlushThenReturnOsSupportValue, IsXeHpCore) { + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + + VariableBackup defaultSettingForGpuIdleBackup(&ImplicitFlushSettings::getSettingForGpuIdle(), false); + + EXPECT_FALSE(commandStreamReceiver.checkPlatformSupportsGpuIdleImplicitFlush()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenPlatformSupportsImplicitFlushForIdleGpuWhenCsrIsMultiContextThenExpectNoSupport) { + VariableBackup defaultSettingForGpuIdleBackup(&ImplicitFlushSettings::getSettingForGpuIdle(), true); + + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + + commandStreamReceiver.multiOsContextCapable = true; + + EXPECT_TRUE(ImplicitFlushSettings::getSettingForGpuIdle()); + EXPECT_FALSE(commandStreamReceiver.checkPlatformSupportsGpuIdleImplicitFlush()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, givenPlatformSupportsImplicitFlushForIdleGpuWhenCsrIsMultiContextAndDirectSubmissionActiveThenExpectSupportTrue) { + VariableBackup 
defaultSettingForGpuIdleBackup(&ImplicitFlushSettings::getSettingForGpuIdle(), true); + VariableBackup backupOsSettingForGpuIdle(&OSInterface::gpuIdleImplicitFlush, true); + + osContext->setDirectSubmissionActive(); + + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + commandStreamReceiver.setupContext(*osContext); + + commandStreamReceiver.multiOsContextCapable = true; + + EXPECT_TRUE(ImplicitFlushSettings::getSettingForGpuIdle()); + EXPECT_TRUE(commandStreamReceiver.checkPlatformSupportsGpuIdleImplicitFlush()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTestXeHPPlus, whenCreatingWorkPartitionAllocationThenItsPropertiesAreCorrect) { + DebugManagerStateRestore restore{}; + DebugManager.flags.EnableStaticPartitioning.set(1); + DebugManager.flags.EnableLocalMemory.set(1); + UltDeviceFactory deviceFactory{1, 2}; + MockDevice &rootDevice = *deviceFactory.rootDevices[0]; + CommandStreamReceiver &csr = rootDevice.getGpgpuCommandStreamReceiver(); + + StorageInfo workPartitionAllocationStorageInfo = csr.getWorkPartitionAllocation()->storageInfo; + EXPECT_EQ(rootDevice.getDeviceBitfield(), workPartitionAllocationStorageInfo.memoryBanks); + EXPECT_EQ(rootDevice.getDeviceBitfield(), workPartitionAllocationStorageInfo.pageTablesVisibility); + EXPECT_FALSE(workPartitionAllocationStorageInfo.cloningOfPageTables); + EXPECT_TRUE(workPartitionAllocationStorageInfo.tileInstanced); +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPPlus, givenXeHpWhenRayTracingEnabledThenDoNotAddCommandBatchBuffer, IsXEHP) { + + MockCsrHw commandStreamReceiver(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + auto cmdSize = commandStreamReceiver.getCmdSizeForPerDssBackedBuffer(pDevice->getHardwareInfo()); + EXPECT_EQ(0u, cmdSize); + std::unique_ptr buffer(new char[cmdSize]); + + LinearStream cs(buffer.get(), cmdSize); + DispatchFlags dispatchFlags = 
DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.usePerDssBackedBuffer = true; + + commandStreamReceiver.programPerDssBackedBuffer(cs, *pDevice, dispatchFlags); + EXPECT_EQ(0u, cs.getUsed()); +} diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_simulated_common_hw_tests_xehp_plus.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_simulated_common_hw_tests_xehp_plus.cpp new file mode 100644 index 0000000000..2fd53b7bc1 --- /dev/null +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_simulated_common_hw_tests_xehp_plus.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/aub/aub_helper.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" + +#include "opencl/source/command_stream/command_stream_receiver_simulated_common_hw.h" +#include "opencl/source/helpers/hardware_context_controller.h" +#include "opencl/test/unit_test/fixtures/cl_device_fixture.h" +#include "opencl/test/unit_test/mocks/mock_aub_stream.h" +#include "opencl/test/unit_test/mocks/mock_csr_simulated_common_hw.h" +#include "test.h" + +using XeHPPlusMockSimulatedCsrHwTests = Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusMockSimulatedCsrHwTests, givenLocalMemoryEnabledWhenGlobalMmiosAreInitializedThenLmemIsInitializedAndLmemCfgMmioIsWritten) { + std::unique_ptr> csrSimulatedCommonHw(new MockSimulatedCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + csrSimulatedCommonHw->localMemoryEnabled = true; + + auto stream = std::make_unique(); + csrSimulatedCommonHw->stream = stream.get(); + csrSimulatedCommonHw->initGlobalMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00101010, 0x00000080u))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000cf58, 0x80000000u))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusMockSimulatedCsrHwTests, 
givenAUBDumpForceAllToLocalMemoryWhenGlobalMmiosAreInitializedThenLmemIsInitializedAndLmemCfgMmioIsWritten) { + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(true); + + std::unique_ptr> csrSimulatedCommonHw(new MockSimulatedCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + + auto stream = std::make_unique(); + csrSimulatedCommonHw->stream = stream.get(); + csrSimulatedCommonHw->initGlobalMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00101010, 0x00000080u))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000cf58, 0x80000000u))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusMockSimulatedCsrHwTests, givenAubCommandStreamReceiverWhenGlobalMmiosAreInitializedThenMOCSRegistersAreConfigured) { + MockSimulatedCsrHw csrSimulatedCommonHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + auto stream = std::make_unique(); + csrSimulatedCommonHw.stream = stream.get(); + + csrSimulatedCommonHw.initGlobalMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004000, 0x00000008))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004004, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004008, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000400C, 0x00000008))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004010, 0x00000018))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004014, 0x00060038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004018, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000401C, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004020, 0x00060037))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004024, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004028, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000402C, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004030, 0x0000003A))); + 
EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004034, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004038, 0x00000037))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000403C, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004040, 0x00000030))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004044, 0x00000034))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004048, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000404C, 0x00000031))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004050, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004054, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004058, 0x0000003A))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000405C, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004060, 0x00000037))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004064, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004068, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000406C, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004070, 0x0000003A))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004074, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004078, 0x00000037))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000407C, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004080, 0x00000030))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004084, 0x00000034))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004088, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000408C, 0x00000031))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004090, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004094, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004098, 0x0000003A))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000409C, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040A0, 0x00000037))); + 
EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040A4, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040A8, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040AC, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040B0, 0x0000003A))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040B4, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040B8, 0x00000037))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040BC, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040C0, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040C4, 0x00000034))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040C8, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040CC, 0x00000031))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040D0, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040D4, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040D8, 0x0000003A))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040DC, 0x00000033))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040E0, 0x00000037))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040E4, 0x0000003B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040E8, 0x00000032))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040EC, 0x00000036))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040F0, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040F4, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040F8, 0x00000038))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x000040FC, 0x00000038))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusMockSimulatedCsrHwTests, givenAubCommandStreamReceiverWhenGlobalMmiosAreInitializedThenLNCFRegistersAreConfigured) { + MockSimulatedCsrHw csrSimulatedCommonHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + auto stream = std::make_unique(); + csrSimulatedCommonHw.stream = 
stream.get(); + + csrSimulatedCommonHw.initGlobalMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B020, 0x00300010))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B024, 0x00300010))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B028, 0x00300030))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B02C, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B030, 0x0030001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B034, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B038, 0x0000001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B03C, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B040, 0x00100000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B044, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B048, 0x0010001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B04C, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B050, 0x0030001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B054, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B058, 0x0000001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B05C, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B060, 0x00100000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B064, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B068, 0x0010001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B06C, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B070, 0x0030001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B074, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B078, 0x0000001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B07C, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B080, 0x00300030))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B084, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B088, 0x0010001F))); + 
EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B08C, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B090, 0x0030001F))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B094, 0x00170013))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B098, 0x00300010))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B09C, 0x00300010))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusMockSimulatedCsrHwTests, givenAubCommandStreamReceiverWhenGlobalMmiosAreInitializedThenPerfMmioRegistersAreConfigured) { + MockSimulatedCsrHw csrSimulatedCommonHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + auto stream = std::make_unique(); + csrSimulatedCommonHw.stream = stream.get(); + + csrSimulatedCommonHw.initGlobalMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B004, 0x2FC0100B))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000B404, 0x00000160))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00008708, 0x00000000))); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusMockSimulatedCsrHwTests, givenAubCommandStreamReceiverWhenGlobalMmiosAreInitializedThenTRTTRegistersAreConfigured) { + MockSimulatedCsrHw csrSimulatedCommonHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + auto stream = std::make_unique(); + csrSimulatedCommonHw.stream = stream.get(); + + csrSimulatedCommonHw.initGlobalMMIO(); + + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004410, 0xffffffff))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004414, 0xfffffffe))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004404, 0x000000ff))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004408, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x0000440C, 0x00000000))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004400, 0x00000001))); + EXPECT_TRUE(stream->isOnMmioList(MMIOPair(0x00004DFC, 0x00000000))); +} + +class XeHPPlusTileRangeRegisterTest : public ClDeviceFixture, public 
::testing::Test { + public: + template + void setUpImpl() { + hardwareInfo = *defaultHwInfo; + hardwareInfoSetup[hardwareInfo.platform.eProductFamily](&hardwareInfo, true, 0); + hardwareInfo.gtSystemInfo.MultiTileArchInfo.IsValid = true; + ClDeviceFixture::SetUpImpl(&hardwareInfo); + } + + void SetUp() override { + } + + void TearDown() override { + ClDeviceFixture::TearDown(); + } + + void checkMMIOs(MMIOList &list, uint32_t tilesNumber, uint32_t localMemorySizeTotalInGB) { + const uint32_t numberOfTiles = tilesNumber; + const uint32_t totalLocalMemorySizeGB = localMemorySizeTotalInGB; + + MMIOPair tileAddrRegisters[] = {{0x00004900, 0x0001}, + {0x00004904, 0x0001}, + {0x00004908, 0x0001}, + {0x0000490c, 0x0001}}; + + uint32_t localMemoryBase = 0x0; + for (uint32_t i = 0; i < sizeof(tileAddrRegisters) / sizeof(MMIOPair); i++) { + tileAddrRegisters[i].second |= localMemoryBase << 1; + tileAddrRegisters[i].second |= (totalLocalMemorySizeGB / numberOfTiles) << 8; + localMemoryBase += (totalLocalMemorySizeGB / numberOfTiles); + } + + uint32_t mmiosFound = 0; + for (auto &mmioPair : list) { + for (uint32_t i = 0; i < numberOfTiles; i++) { + if (mmioPair.first == tileAddrRegisters[i].first && mmioPair.second == tileAddrRegisters[i].second) { + mmiosFound++; + } + } + } + EXPECT_EQ(numberOfTiles, mmiosFound); + } +}; + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTileRangeRegisterTest, givenLocalMemoryEnabledWhenGlobalMmiosAreInitializedThenTileRangeRegistersAreProgrammed) { + setUpImpl(); + std::unique_ptr> csrSimulatedCommonHw(new MockSimulatedCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + csrSimulatedCommonHw->localMemoryEnabled = true; + + auto stream = std::make_unique(); + csrSimulatedCommonHw->stream = stream.get(); + csrSimulatedCommonHw->initGlobalMMIO(); + + checkMMIOs(stream->mmioList, 1, 32); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTileRangeRegisterTest, 
givenLocalMemoryEnabledAnd4TileConfigWhenGlobalMmiosAreInitializedThenTileRangeRegistersAreProgrammed) { + DebugManagerStateRestore restorer; + DebugManager.flags.CreateMultipleSubDevices.set(4); + setUpImpl(); + std::unique_ptr> csrSimulatedCommonHw(new MockSimulatedCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + csrSimulatedCommonHw->localMemoryEnabled = true; + + auto stream = std::make_unique(); + csrSimulatedCommonHw->stream = stream.get(); + csrSimulatedCommonHw->initGlobalMMIO(); + + checkMMIOs(stream->mmioList, 4, 32); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTileRangeRegisterTest, givenAUBDumpForceAllToLocalMemoryWhenGlobalMmiosAreInitializedThenTileRangeRegistersAreProgrammed) { + setUpImpl(); + DebugManagerStateRestore debugRestorer; + DebugManager.flags.AUBDumpForceAllToLocalMemory.set(true); + + std::unique_ptr> csrSimulatedCommonHw(new MockSimulatedCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield())); + csrSimulatedCommonHw->localMemoryEnabled = true; + + auto stream = std::make_unique(); + csrSimulatedCommonHw->stream = stream.get(); + csrSimulatedCommonHw->initGlobalMMIO(); + + checkMMIOs(stream->mmioList, 1, 32); +} diff --git a/opencl/test/unit_test/command_stream/compute_mode_tests_xehp_plus.cpp b/opencl/test/unit_test/command_stream/compute_mode_tests_xehp_plus.cpp new file mode 100644 index 0000000000..f10c56cfc9 --- /dev/null +++ b/opencl/test/unit_test/command_stream/compute_mode_tests_xehp_plus.cpp @@ -0,0 +1,426 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/ptr_math.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/mocks/mock_device.h" + +#include "opencl/test/unit_test/command_stream/compute_mode_tests.h" +#include 
"opencl/test/unit_test/mocks/mock_allocation_properties.h" +#include "test.h" + +#include "test_traits_common.h" + +using namespace NEO; + +HWCMDTEST_F(IGFX_XE_HP_CORE, ComputeModeRequirements, givenCoherencyWithoutSharedHandlesWhenCommandSizeIsCalculatedThenCorrectCommandSizeIsReturned) { + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + SetUpImpl(); + + getCsrHw()->requiredThreadArbitrationPolicy = getCsrHw()->lastSentThreadArbitrationPolicy; + auto cmdsSize = sizeof(STATE_COMPUTE_MODE); + + overrideComputeModeRequest(false, false, false); + auto retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(0u, retSize); + + overrideComputeModeRequest(false, true, false); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(0u, retSize); + + overrideComputeModeRequest(true, true, false); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdsSize, retSize); + + overrideComputeModeRequest(true, false, false); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdsSize, retSize); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, ComputeModeRequirements, givenCoherencyWithSharedHandlesWhenCommandSizeIsCalculatedThenCorrectCommandSizeIsReturned) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto cmdsSize = sizeof(STATE_COMPUTE_MODE) + sizeof(PIPE_CONTROL); + + overrideComputeModeRequest(false, false, true); + auto retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdsSize, retSize); + + overrideComputeModeRequest(false, true, true); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdsSize, retSize); + + overrideComputeModeRequest(true, true, true); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdsSize, retSize); + + overrideComputeModeRequest(true, false, true); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdsSize, retSize); +} + +struct 
ForceNonCoherentSupportedMatcher { + template + static constexpr bool isMatched() { + if constexpr (HwMapper::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) { + return TestTraits::get()>::forceNonCoherentSupported; + } + return false; + } +}; + +HWTEST2_F(ComputeModeRequirements, givenCoherencyWithoutSharedHandlesWhenComputeModeIsProgrammedThenCorrectCommandsAreAdded, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + + auto cmdsSize = sizeof(STATE_COMPUTE_MODE); + char buff[1024] = {0}; + LinearStream stream(buff, 1024); + + auto expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT); + auto expectedBitsMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + overrideComputeModeRequest(true, false, false, false); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize, stream.getUsed()); + + auto scmCmd = reinterpret_cast(stream.getCpuBase()); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); + + auto startOffset = stream.getUsed(); + + overrideComputeModeRequest(true, true, false, false); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize * 2, stream.getUsed()); + + expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_DISABLED); + scmCmd = reinterpret_cast(ptrOffset(stream.getCpuBase(), startOffset)); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); +} + +HWTEST2_F(ComputeModeRequirements, 
givenCoherencyWithSharedHandlesWhenComputeModeIsProgrammedThenCorrectCommandsAreAdded, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto cmdsSize = sizeof(STATE_COMPUTE_MODE) + sizeof(PIPE_CONTROL); + char buff[1024] = {0}; + LinearStream stream(buff, 1024); + + auto expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT); + auto expectedBitsMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + auto expectedPcCmd = FamilyType::cmdInitPipeControl; + + overrideComputeModeRequest(true, false, true, false); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize, stream.getUsed()); + + auto scmCmd = reinterpret_cast(stream.getCpuBase()); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); + + auto pcCmd = reinterpret_cast(ptrOffset(stream.getCpuBase(), sizeof(STATE_COMPUTE_MODE))); + EXPECT_TRUE(memcmp(&expectedPcCmd, pcCmd, sizeof(PIPE_CONTROL)) == 0); + + auto startOffset = stream.getUsed(); + + overrideComputeModeRequest(true, true, true, false); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize * 2, stream.getUsed()); + + expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_DISABLED); + scmCmd = reinterpret_cast(ptrOffset(stream.getCpuBase(), startOffset)); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); + + pcCmd = 
reinterpret_cast(ptrOffset(stream.getCpuBase(), startOffset + sizeof(STATE_COMPUTE_MODE))); + EXPECT_TRUE(memcmp(&expectedPcCmd, pcCmd, sizeof(PIPE_CONTROL)) == 0); +} + +HWTEST2_F(ComputeModeRequirements, givenCoherencyRequirementWithoutSharedHandlesWhenFlushTaskCalledThenProgramCmdOnlyIfChanged, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto startOffset = getCsrHw()->commandStream.getUsed(); + + auto graphicAlloc = csr->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + IndirectHeap stream(graphicAlloc); + + auto flushTask = [&](bool coherencyRequired) { + getCsrHw()->lastSentThreadArbitrationPolicy = getCsrHw()->requiredThreadArbitrationPolicy; + flags.requiresCoherency = coherencyRequired; + startOffset = getCsrHw()->commandStream.getUsed(); + csr->flushTask(stream, 0, stream, stream, stream, 0, flags, *device); + }; + + auto findCmd = [&](bool expectToBeProgrammed, bool expectCoherent) { + HardwareParse hwParser; + hwParser.parseCommands(getCsrHw()->commandStream, startOffset); + bool foundOne = false; + + typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT expectedCoherentValue = expectCoherent ? 
STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_DISABLED : STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT; + uint32_t expectedCoherentMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + for (auto it = hwParser.cmdList.begin(); it != hwParser.cmdList.end(); it++) { + auto cmd = genCmdCast(*it); + if (cmd) { + EXPECT_EQ(expectedCoherentValue, cmd->getForceNonCoherent()); + EXPECT_TRUE(isValueSet(cmd->getMaskBits(), expectedCoherentMask)); + EXPECT_FALSE(foundOne); + foundOne = true; + auto pc = genCmdCast(*(++it)); + EXPECT_EQ(nullptr, pc); + } + } + EXPECT_EQ(expectToBeProgrammed, foundOne); + }; + + flushTask(false); + findCmd(true, false); // first time + + flushTask(false); + findCmd(false, false); // not changed + + flushTask(true); + findCmd(true, true); // changed + + flushTask(true); + findCmd(false, true); // not changed + + flushTask(false); + findCmd(true, false); // changed + + flushTask(false); + findCmd(false, false); // not changed + csr->getMemoryManager()->freeGraphicsMemory(graphicAlloc); +} + +HWTEST2_F(ComputeModeRequirements, givenCoherencyRequirementWithSharedHandlesWhenFlushTaskCalledThenAlwaysProgramCmds, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto startOffset = getCsrHw()->commandStream.getUsed(); + auto graphicsAlloc = csr->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + IndirectHeap stream(graphicsAlloc); + + auto flushTask = [&](bool coherencyRequired) { + getCsrHw()->lastSentThreadArbitrationPolicy = getCsrHw()->requiredThreadArbitrationPolicy; + flags.requiresCoherency = coherencyRequired; + makeResidentSharedAlloc(); + + startOffset = getCsrHw()->commandStream.getUsed(); + csr->flushTask(stream, 0, stream, stream, stream, 0, flags, 
*device); + }; + + auto flushTaskAndFindCmds = [&](bool expectCoherent) { + flushTask(expectCoherent); + HardwareParse hwParser; + hwParser.parseCommands(getCsrHw()->commandStream, startOffset); + bool foundOne = false; + + typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT expectedCoherentValue = expectCoherent ? STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_DISABLED : STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT; + uint32_t expectedCoherentMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + for (auto it = hwParser.cmdList.begin(); it != hwParser.cmdList.end(); it++) { + auto cmd = genCmdCast(*it); + if (cmd) { + EXPECT_EQ(expectedCoherentValue, cmd->getForceNonCoherent()); + EXPECT_TRUE(isValueSet(cmd->getMaskBits(), expectedCoherentMask)); + EXPECT_FALSE(foundOne); + foundOne = true; + auto pc = genCmdCast(*(++it)); + EXPECT_NE(nullptr, pc); + } + } + EXPECT_TRUE(foundOne); + }; + + flushTaskAndFindCmds(false); // first time + flushTaskAndFindCmds(false); // not changed + flushTaskAndFindCmds(true); // changed + flushTaskAndFindCmds(true); // not changed + flushTaskAndFindCmds(false); // changed + flushTaskAndFindCmds(false); // not changed + + csr->getMemoryManager()->freeGraphicsMemory(graphicsAlloc); +} + +HWTEST2_F(ComputeModeRequirements, givenFlushWithoutSharedHandlesWhenPreviouslyUsedThenProgramPcAndSCM, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto graphicAlloc = csr->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + IndirectHeap stream(graphicAlloc); + + makeResidentSharedAlloc(); + csr->flushTask(stream, 0, stream, stream, stream, 0, flags, *device); + EXPECT_TRUE(getCsrHw()->getCsrRequestFlags()->hasSharedHandles); + auto startOffset = 
getCsrHw()->commandStream.getUsed(); + + csr->flushTask(stream, 0, stream, stream, stream, 0, flags, *device); + EXPECT_TRUE(getCsrHw()->getCsrRequestFlags()->hasSharedHandles); + + HardwareParse hwParser; + hwParser.parseCommands(getCsrHw()->commandStream, startOffset); + + typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT expectedCoherentValue = STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT; + uint32_t expectedCoherentMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + bool foundOne = false; + for (auto it = hwParser.cmdList.begin(); it != hwParser.cmdList.end(); it++) { + auto cmd = genCmdCast(*it); + if (cmd) { + EXPECT_EQ(expectedCoherentValue, cmd->getForceNonCoherent()); + EXPECT_TRUE(isValueSet(cmd->getMaskBits(), expectedCoherentMask)); + EXPECT_FALSE(foundOne); + foundOne = true; + auto pc = genCmdCast(*(++it)); + EXPECT_NE(nullptr, pc); + } + } + EXPECT_TRUE(foundOne); + + csr->getMemoryManager()->freeGraphicsMemory(graphicAlloc); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, ComputeModeRequirements, givenComputeModeCmdSizeWhenLargeGrfModeChangeIsRequiredThenSCMCommandSizeIsCalculated) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto cmdSize = 0u; + + overrideComputeModeRequest(false, false, false, false, 128u); + auto retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdSize, retSize); + + cmdSize = sizeof(STATE_COMPUTE_MODE); + + overrideComputeModeRequest(false, false, false, true, 256u); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdSize, retSize); + + overrideComputeModeRequest(true, false, false, true, 256u); + retSize = getCsrHw()->getCmdSizeForComputeMode(); + EXPECT_EQ(cmdSize, retSize); +} + +HWTEST2_F(ComputeModeRequirements, givenComputeModeProgrammingWhenLargeGrfModeChangeIsRequiredThenCorrectCommandsAreAdded, ForceNonCoherentSupportedMatcher) { + 
SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto cmdsSize = sizeof(STATE_COMPUTE_MODE); + char buff[1024]; + LinearStream stream(buff, 1024); + + auto expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT); + auto expectedBitsMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + expectedScmCmd.setLargeGrfMode(true); + + overrideComputeModeRequest(false, false, false, true, 256u); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize, stream.getUsed()); + + auto scmCmd = reinterpret_cast(stream.getCpuBase()); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); + + auto startOffset = stream.getUsed(); + + overrideComputeModeRequest(false, false, false, true, 128u); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize * 2, stream.getUsed()); + + expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setLargeGrfMode(false); + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT); + scmCmd = reinterpret_cast(ptrOffset(stream.getCpuBase(), startOffset)); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, ComputeModeRequirements, givenComputeModeProgrammingWhenLargeGrfModeDoesntChangeThenSCMIsNotAdded) { + SetUpImpl(); + + char buff[1024]; + LinearStream stream(buff, 1024); + + overrideComputeModeRequest(false, false, false, false, 256u); + 
getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(0u, stream.getUsed()); +} + +HWTEST2_F(ComputeModeRequirements, givenComputeModeProgrammingWhenRequiredGRFNumberIsLowerThan128ThenSmallGRFModeIsProgrammed, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + + auto cmdsSize = sizeof(STATE_COMPUTE_MODE); + char buff[1024]; + LinearStream stream(buff, 1024); + + auto expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setLargeGrfMode(false); + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT); + auto expectedBitsMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + overrideComputeModeRequest(false, false, false, true, 127u); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize, stream.getUsed()); + + auto scmCmd = reinterpret_cast(stream.getCpuBase()); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); +} + +HWTEST2_F(ComputeModeRequirements, givenComputeModeProgrammingWhenRequiredGRFNumberIsGreaterThan128ThenLargeGRFModeIsProgrammed, ForceNonCoherentSupportedMatcher) { + SetUpImpl(); + using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto cmdsSize = sizeof(STATE_COMPUTE_MODE); + char buff[1024]; + LinearStream stream(buff, 1024); + + auto expectedScmCmd = FamilyType::cmdInitStateComputeMode; + expectedScmCmd.setForceNonCoherent(STATE_COMPUTE_MODE::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT); + expectedScmCmd.setLargeGrfMode(true); + auto expectedBitsMask = FamilyType::stateComputeModeForceNonCoherentMask | FamilyType::stateComputeModeLargeGrfModeMask; + + 
getCsrHw()->requiredThreadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent; + + overrideComputeModeRequest(false, false, false, true, 256u); + getCsrHw()->programComputeMode(stream, flags, *defaultHwInfo); + EXPECT_EQ(cmdsSize, stream.getUsed()); + + auto scmCmd = reinterpret_cast(stream.getCpuBase()); + EXPECT_TRUE(isValueSet(scmCmd->getMaskBits(), expectedBitsMask)); + expectedScmCmd.setMaskBits(scmCmd->getMaskBits()); + EXPECT_TRUE(memcmp(&expectedScmCmd, scmCmd, sizeof(STATE_COMPUTE_MODE)) == 0); +} diff --git a/opencl/test/unit_test/command_stream/get_devices_tests.cpp b/opencl/test/unit_test/command_stream/get_devices_tests.cpp index 41eb97504d..40c4cacf9b 100644 --- a/opencl/test/unit_test/command_stream/get_devices_tests.cpp +++ b/opencl/test/unit_test/command_stream/get_devices_tests.cpp @@ -192,4 +192,20 @@ HWTEST_F(PrepareDeviceEnvironmentsTest, givenPrepareDeviceEnvironmentsAndUnknown } } } + +TEST(MultiDeviceTests, givenCreateMultipleRootDevicesAndLimitAmountOfReturnedDevicesFlagWhenClGetDeviceIdsIsCalledThenLowerValueIsReturned) { + platformsImpl->clear(); + VariableBackup backup(&ultHwConfig); + ultHwConfig.useHwCsr = true; + ultHwConfig.forceOsAgnosticMemoryManager = false; + ultHwConfig.useMockedPrepareDeviceEnvironmentsFunc = false; + DebugManagerStateRestore stateRestore; + DebugManager.flags.CreateMultipleRootDevices.set(2); + DebugManager.flags.LimitAmountOfReturnedDevices.set(1); + cl_uint numDevices = 0; + + auto retVal = clGetDeviceIDs(nullptr, CL_DEVICE_TYPE_GPU, 0, nullptr, &numDevices); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(1u, numDevices); +} } // namespace NEO diff --git a/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp b/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp new file mode 100644 index 0000000000..e18ef56c89 --- /dev/null +++ b/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp @@ -0,0 +1,16 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * 
SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/implicit_scaling.h" + +#include "test.h" + +using namespace NEO; + +TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSupportEnabled) { + EXPECT_TRUE(ImplicitScaling::apiSupport); +} diff --git a/opencl/test/unit_test/command_stream/tbx_command_stream_receiver_tests_xehp_plus.cpp b/opencl/test/unit_test/command_stream/tbx_command_stream_receiver_tests_xehp_plus.cpp new file mode 100644 index 0000000000..8cd9bef52c --- /dev/null +++ b/opencl/test/unit_test/command_stream/tbx_command_stream_receiver_tests_xehp_plus.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_stream/tbx_command_stream_receiver_hw.h" +#include "shared/source/memory_manager/memory_banks.h" +#include "shared/source/memory_manager/memory_pool.h" +#include "shared/source/memory_manager/physical_address_allocator.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/mocks/mock_graphics_allocation.h" + +#include "opencl/test/unit_test/fixtures/cl_device_fixture.h" +#include "opencl/test/unit_test/helpers/hw_helper_tests.h" +#include "test.h" + +using namespace NEO; + +struct XeHPPlusTbxCommandStreamReceiverTests : ClDeviceFixture, ::testing::Test { + template + void setUpImpl() { + hardwareInfo = *defaultHwInfo; + hardwareInfoSetup[hardwareInfo.platform.eProductFamily](&hardwareInfo, true, 0); + hardwareInfo.gtSystemInfo.MultiTileArchInfo.IsValid = true; + ClDeviceFixture::SetUpImpl(&hardwareInfo); + } + + void SetUp() override { + } + + void TearDown() override { + ClDeviceFixture::TearDown(); + } +}; + +template +struct MockTbxCommandStreamReceiverHw : TbxCommandStreamReceiverHw { + using TbxCommandStreamReceiverHw::TbxCommandStreamReceiverHw; + + uint32_t getDeviceIndex() const override { + return deviceIndex; + } + + uint32_t deviceIndex = 0u; +}; + 
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, givenNullPtrGraphicsAlloctionWhenGetPPGTTAdditionalBitsIsCalledThenAppropriateValueIsReturned) {
+    // Fixture SetUp() is empty, so the device is created here for the tested family.
+    setUpImpl<FamilyType>();
+    auto tbxCsr = std::make_unique<MockTbxCommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
+    // A null allocation must be accepted and still yield the base value 3u.
+    GraphicsAllocation *allocation = nullptr;
+    auto bits = tbxCsr->getPPGTTAdditionalBits(allocation);
+
+    EXPECT_EQ(3u, bits);
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, givenGraphicsAlloctionWithNonLocalMemoryPoolWhenGetPPGTTAdditionalBitsIsCalledThenAppropriateValueIsReturned) {
+    setUpImpl<FamilyType>();
+    auto tbxCsr = std::make_unique<MockTbxCommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
+    // Memory pool is left as constructed (not overridden to LocalMemory): bit 11 must stay clear.
+    MockGraphicsAllocation allocation(nullptr, 0);
+    auto bits = tbxCsr->getPPGTTAdditionalBits(&allocation);
+
+    EXPECT_EQ(3u, bits);
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, givenGraphicsAlloctionWithLocalMemoryPoolWhenGetPPGTTAdditionalBitsIsCalledThenAppropriateValueIsReturned) {
+    setUpImpl<FamilyType>();
+    auto tbxCsr = std::make_unique<MockTbxCommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
+    MockGraphicsAllocation allocation(nullptr, 0);
+    // Force the allocation into the local-memory pool; bit 11 is then expected on top of 3u.
+    allocation.overrideMemoryPool(MemoryPool::LocalMemory);
+    auto bits = tbxCsr->getPPGTTAdditionalBits(&allocation);
+
+    EXPECT_EQ(3u | (1 << 11), bits);
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, givenAubDumpForceAllToLocalMemoryPoolWhenGetPPGTTAdditionalBitsIsCalledThenLocalBitIsReturned) {
+    setUpImpl<FamilyType>();
+    // NOTE(review): debugRestorer presumably restores the debug flags at scope exit - confirm.
+    DebugManagerStateRestore debugRestorer;
+    DebugManager.flags.AUBDumpForceAllToLocalMemory.set(true);
+
+    auto tbxCsr = std::make_unique<MockTbxCommandStreamReceiverHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
+    // The memory pool is NOT overridden here; bit 11 is expected purely because of the debug flag.
+    MockGraphicsAllocation allocation(nullptr, 0);
+
+    auto bits = tbxCsr->getPPGTTAdditionalBits(&allocation);
+
+    EXPECT_EQ(3u | (1 << 11), bits);
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, givenLocalMemoryFeatureWhenGetGTTDataIsCalledThenLocalMemoryIsSet) { + setUpImpl(); + DebugManagerStateRestore debugRestorer; + DebugManager.flags.EnableLocalMemory.set(1); + hardwareInfo.featureTable.ftrLocalMemory = true; + + std::unique_ptr device(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo)); + auto tbxCsr = std::make_unique>(*device->executionEnvironment, device->getRootDeviceIndex(), device->getDeviceBitfield()); + tbxCsr->setupContext(*device->getDefaultEngine().osContext); + + AubGTTData data = {false, false}; + tbxCsr->getGTTData(nullptr, data); + EXPECT_TRUE(data.localMemory); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, givenLocalMemoryEnabledWhenGetMemoryBankForGttIsCalledThenCorrectBankForDeviceIsReturned) { + setUpImpl(); + DebugManagerStateRestore debugRestorer; + DebugManager.flags.EnableLocalMemory.set(1); + hardwareInfo.featureTable.ftrLocalMemory = true; + + std::unique_ptr device(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo)); + auto tbxCsr = std::make_unique>(*device->executionEnvironment, device->getRootDeviceIndex(), device->getDeviceBitfield()); + + auto bank = tbxCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(0), bank); + + tbxCsr->deviceIndex = 1u; + bank = tbxCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(1), bank); + + tbxCsr->deviceIndex = 2u; + bank = tbxCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(2), bank); + + tbxCsr->deviceIndex = 3u; + bank = tbxCsr->getMemoryBankForGtt(); + EXPECT_EQ(MemoryBanks::getBankForLocalMemory(3), bank); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, whenPhysicalAllocatorIsCreatedThenItHasCorrectBankSzieAndNumberOfBanks) { + setUpImpl(); + std::unique_ptr device(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo)); + auto tbxCsr = 
std::make_unique>(*device->executionEnvironment, device->getRootDeviceIndex(), device->getDeviceBitfield()); + auto physicalAddressAllocator = tbxCsr->physicalAddressAllocator.get(); + auto allocator = reinterpret_cast *>(physicalAddressAllocator); + + EXPECT_EQ(32 * MemoryConstants::gigaByte, allocator->getBankSize()); + EXPECT_EQ(1u, allocator->getNumberOfBanks()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, whenPhysicalAllocatorIsCreatedFor4TilesThenItHasCorrectBankSzieAndNumberOfBanks) { + DebugManagerStateRestore restorer; + DebugManager.flags.CreateMultipleSubDevices.set(4); + setUpImpl(); + std::unique_ptr device(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo)); + auto tbxCsr = std::make_unique>(*device->executionEnvironment, device->getRootDeviceIndex(), device->getDeviceBitfield()); + auto physicalAddressAllocator = tbxCsr->physicalAddressAllocator.get(); + auto allocator = reinterpret_cast *>(physicalAddressAllocator); + + EXPECT_EQ(8 * MemoryConstants::gigaByte, allocator->getBankSize()); + EXPECT_EQ(4u, allocator->getNumberOfBanks()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPPlusTbxCommandStreamReceiverTests, whenAskedForPollForCompletionParametersThenReturnCorrectValues) { + setUpImpl(); + class MyMockTbxHw : public TbxCommandStreamReceiverHw { + public: + MyMockTbxHw(ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) + : TbxCommandStreamReceiverHw(executionEnvironment, 0, deviceBitfield) {} + using TbxCommandStreamReceiverHw::getpollNotEqualValueForPollForCompletion; + using TbxCommandStreamReceiverHw::getMaskAndValueForPollForCompletion; + }; + + MyMockTbxHw myMockTbxHw(*pDevice->executionEnvironment, pDevice->getDeviceBitfield()); + EXPECT_EQ(0x80u, myMockTbxHw.getMaskAndValueForPollForCompletion()); + EXPECT_TRUE(myMockTbxHw.getpollNotEqualValueForPollForCompletion()); +} diff --git a/opencl/test/unit_test/context/context_tests.cpp 
b/opencl/test/unit_test/context/context_tests.cpp index 8263d40842..3618117422 100644 --- a/opencl/test/unit_test/context/context_tests.cpp +++ b/opencl/test/unit_test/context/context_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/device/device.h" +#include "shared/source/helpers/blit_commands_helper.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/variable_backup.h" #include "shared/test/common/mocks/mock_deferred_deleter.h" @@ -14,6 +15,7 @@ #include "opencl/source/command_queue/command_queue.h" #include "opencl/source/context/context.inl" #include "opencl/source/device_queue/device_queue.h" +#include "opencl/source/mem_obj/buffer.h" #include "opencl/source/sharings/sharing.h" #include "opencl/test/unit_test/fixtures/platform_fixture.h" #include "opencl/test/unit_test/mocks/mock_cl_device.h" @@ -22,8 +24,7 @@ #include "opencl/test/unit_test/mocks/mock_memory_manager.h" #include "opencl/test/unit_test/mocks/mock_platform.h" #include "opencl/test/unit_test/test_macros/test_checks_ocl.h" - -#include "gtest/gtest.h" +#include "test.h" using namespace NEO; @@ -513,3 +514,56 @@ TEST(Context, givenContextAndDevicesWhenIsTileOnlyThenProperValueReturned) { EXPECT_FALSE(subDevicesContext.isSingleDeviceContext()); EXPECT_FALSE(multipleDevicesContext.isSingleDeviceContext()); } + +TEST(InvalidExtraPropertiesTests, givenInvalidExtraPropertiesWhenCreatingContextThenContextIsNotCreated) { + constexpr cl_context_properties INVALID_PROPERTY_TYPE = (1 << 31); + constexpr cl_context_properties INVALID_CONTEXT_FLAG = (1 << 31); + + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); + cl_device_id deviceID = device.get(); + cl_int retVal = 0; + std::unique_ptr context; + + { + cl_context_properties properties[] = {INVALID_PROPERTY_TYPE, INVALID_CONTEXT_FLAG, 0}; + context.reset(Context::create(properties, ClDeviceVector(&deviceID, 1), nullptr, nullptr, retVal)); + 
EXPECT_EQ(CL_INVALID_PROPERTY, retVal); + EXPECT_EQ(nullptr, context.get()); + } +} + +using ContextCreateTests = ::testing::Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, ContextCreateTests, givenLocalMemoryAllocationWhenBlitMemoryToAllocationIsCalledThenSuccessIsReturned) { + if (is32bit) { + GTEST_SKIP(); + } + + DebugManagerStateRestore restore; + DebugManager.flags.EnableLocalMemory.set(true); + DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::Default)); + UltClDeviceFactory deviceFactory{1, 2}; + + ClDevice *devicesToTest[] = {deviceFactory.rootDevices[0], deviceFactory.subDevices[0], deviceFactory.subDevices[1]}; + + for (const auto &testedDevice : devicesToTest) { + + MockContext context(testedDevice); + cl_int retVal; + auto buffer = std::unique_ptr(Buffer::create(&context, {}, 1, nullptr, retVal)); + auto memory = buffer->getGraphicsAllocation(testedDevice->getRootDeviceIndex()); + uint8_t hostMemory[1]; + auto executionEnv = testedDevice->getExecutionEnvironment(); + executionEnv->rootDeviceEnvironments[0]->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = false; + + const auto &hwInfo = testedDevice->getHardwareInfo(); + auto isBlitterRequired = HwHelper::get(hwInfo.platform.eRenderCoreFamily).isBlitCopyRequiredForLocalMemory(hwInfo, *memory); + + auto expectedStatus = isBlitterRequired ? 
BlitOperationResult::Success : BlitOperationResult::Unsupported; + + EXPECT_EQ(expectedStatus, BlitHelper::blitMemoryToAllocation(buffer->getContext()->getDevice(0)->getDevice(), memory, buffer->getOffset(), hostMemory, {1, 1, 1})); + + executionEnv->rootDeviceEnvironments[0]->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true; + EXPECT_EQ(BlitOperationResult::Success, BlitHelper::blitMemoryToAllocation(buffer->getContext()->getDevice(0)->getDevice(), memory, buffer->getOffset(), hostMemory, {1, 1, 1})); + } +} \ No newline at end of file diff --git a/opencl/test/unit_test/fixtures/CMakeLists.txt b/opencl/test/unit_test/fixtures/CMakeLists.txt index 6873ff4f79..8c0c0b7f2f 100644 --- a/opencl/test/unit_test/fixtures/CMakeLists.txt +++ b/opencl/test/unit_test/fixtures/CMakeLists.txt @@ -37,6 +37,7 @@ set(IGDRCL_SRCS_tests_fixtures ${CMAKE_CURRENT_SOURCE_DIR}/memory_manager_fixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory_manager_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/multi_root_device_fixture.h + ${CMAKE_CURRENT_SOURCE_DIR}/multi_tile_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/one_mip_level_image_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/platform_fixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/platform_fixture.h diff --git a/opencl/test/unit_test/fixtures/multi_tile_fixture.h b/opencl/test/unit_test/fixtures/multi_tile_fixture.h new file mode 100644 index 0000000000..ebe01edeb1 --- /dev/null +++ b/opencl/test/unit_test/fixtures/multi_tile_fixture.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/execution_environment/root_device_environment.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/helpers/default_hw_info.h" +#include "shared/test/common/helpers/ult_hw_config.h" +#include "shared/test/common/helpers/variable_backup.h" + +#include "opencl/source/platform/platform.h" +#include 
"opencl/test/unit_test/mocks/mock_platform.h" +#include "test.h" + +struct MultiTileFixture : public ::testing::Test { + void SetUp() override { + ultHwConfig.useMockedPrepareDeviceEnvironmentsFunc = false; + ultHwConfig.useHwCsr = true; + ultHwConfig.forceOsAgnosticMemoryManager = false; + DebugManager.flags.CreateMultipleSubDevices.set(requiredDeviceCount); + DebugManager.flags.DeferOsContextInitialization.set(0); + platformsImpl->clear(); + constructPlatform(); + initPlatform(); + }; + + protected: + VariableBackup backup{&ultHwConfig}; + DebugManagerStateRestore stateRestore; + cl_uint requiredDeviceCount = 2u; +}; + +struct FourTileFixture : public MultiTileFixture { + FourTileFixture() : MultiTileFixture() { requiredDeviceCount = 4; } +}; diff --git a/opencl/test/unit_test/mocks/CMakeLists.txt b/opencl/test/unit_test/mocks/CMakeLists.txt index 4fd618cf33..991999ba48 100644 --- a/opencl/test/unit_test/mocks/CMakeLists.txt +++ b/opencl/test/unit_test/mocks/CMakeLists.txt @@ -62,6 +62,12 @@ set(IGDRCL_SRCS_tests_mocks ${NEO_CORE_tests_compiler_mocks} ) +if(TESTS_XEHP_PLUS) + list(APPEND IGDRCL_SRCS_tests_mocks + ${CMAKE_CURRENT_SOURCE_DIR}/mock_scratch_space_controller_xehp_plus.h + ) +endif() + if(WIN32) file(GLOB IGDRCL_SRC_tests_mock_wddm "${CMAKE_CURRENT_SOURCE_DIR}/mock_wddm2[0-9]\.*") list(APPEND IGDRCL_SRCS_tests_mocks diff --git a/opencl/test/unit_test/mocks/mock_scratch_space_controller_xehp_plus.h b/opencl/test/unit_test/mocks/mock_scratch_space_controller_xehp_plus.h new file mode 100644 index 0000000000..b4583cd54a --- /dev/null +++ b/opencl/test/unit_test/mocks/mock_scratch_space_controller_xehp_plus.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2016-2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "shared/source/command_stream/scratch_space_controller_xehp_plus.h" + +namespace NEO { +struct MockScratchSpaceControllerXeHPPlus : public ScratchSpaceControllerXeHPPlus { + using 
ScratchSpaceControllerXeHPPlus::computeUnitsUsedForScratch; + using ScratchSpaceControllerXeHPPlus::getOffsetToSurfaceState; + using ScratchSpaceControllerXeHPPlus::perThreadScratchSize; + using ScratchSpaceControllerXeHPPlus::privateScratchAllocation; + using ScratchSpaceControllerXeHPPlus::privateScratchSizeBytes; + using ScratchSpaceControllerXeHPPlus::scratchAllocation; + using ScratchSpaceControllerXeHPPlus::scratchSizeBytes; + using ScratchSpaceControllerXeHPPlus::ScratchSpaceControllerXeHPPlus; + using ScratchSpaceControllerXeHPPlus::singleSurfaceStateSize; + using ScratchSpaceControllerXeHPPlus::slotId; + using ScratchSpaceControllerXeHPPlus::stateSlotsCount; + using ScratchSpaceControllerXeHPPlus::surfaceStateHeap; + using ScratchSpaceControllerXeHPPlus::updateSlots; +}; +} // namespace NEO diff --git a/shared/source/helpers/engine_node_helper.h b/shared/source/helpers/engine_node_helper.h index 5b2b70584d..e86a4ca029 100644 --- a/shared/source/helpers/engine_node_helper.h +++ b/shared/source/helpers/engine_node_helper.h @@ -32,6 +32,7 @@ bool isCcs(aub_stream::EngineType engineType); bool isBcs(aub_stream::EngineType engineType); aub_stream::EngineType getBcsEngineType(const HardwareInfo &hwInfo, SelectorCopyEngine &selectorCopyEngine, bool internalUsage = false); void releaseBcsEngineType(aub_stream::EngineType engineType, SelectorCopyEngine &selectorCopyEngine); +aub_stream::EngineType remapEngineTypeToHwSpecific(aub_stream::EngineType inputType, const HardwareInfo &hwInfo); std::string engineTypeToString(aub_stream::EngineType engineType); std::string engineTypeToStringAdditional(aub_stream::EngineType engineType); diff --git a/shared/source/helpers/engine_node_helper_extra.cpp b/shared/source/helpers/engine_node_helper_extra.cpp index 41bf5655a2..e3f87f432e 100644 --- a/shared/source/helpers/engine_node_helper_extra.cpp +++ b/shared/source/helpers/engine_node_helper_extra.cpp @@ -23,5 +23,9 @@ std::string 
engineTypeToStringAdditional(aub_stream::EngineType engineType) { return "Unknown"; } +aub_stream::EngineType remapEngineTypeToHwSpecific(aub_stream::EngineType inputType, const HardwareInfo &hwInfo) { + return inputType; +} + } // namespace EngineHelpers } // namespace NEO