From 82a6f9e7b2e9b83a538858c90a0c7f5a0ba4225a Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 12 Jul 2022 15:28:06 +0000 Subject: [PATCH] Use compute walker system fence only when both system memory and events are in use Related-To: NEO-6959 Signed-off-by: Zbigniew Zdanowicz --- .../xe_hpc_core/test_cmdlist_xe_hpc_core.cpp | 89 ++++++- .../hardware_interface_xehp_and_later.inl | 3 +- ...d_stream_receiver_hw_tests_xe_hpc_core.cpp | 237 +++++++++++++++--- .../command_encoder_xehp_and_later.inl | 2 +- .../xe_hpc_core/test_encode_xe_hpc_core.cpp | 50 +++- 5 files changed, 331 insertions(+), 50 deletions(-) diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp index 9cd2fa0078..87e91023fc 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp @@ -321,7 +321,8 @@ HWTEST2_F(CommandListAppendRangesBarrierXeHpcCore, givenCallToAppendRangesBarrie EXPECT_TRUE(pipeControlCmd->getUnTypedDataPortCacheFlush()); } -HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhenKernelNotUsingSystemMemoryAllocationsAndEventNotHostSignalScopeThenExpectsNoSystemFenceUsed, IsXeHpcCore) { +HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, + givenHwSupportsSystemFenceWhenKernelNotUsingSystemMemoryAllocationsAndEventNotHostSignalScopeThenExpectsNoSystemFenceUsed, IsXeHpcCore) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; ze_result_t result = ZE_RESULT_SUCCESS; @@ -390,7 +391,8 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhen ASSERT_EQ(result, ZE_RESULT_SUCCESS); } -HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhenKernelUsingUsmHostMemoryAllocationsAndEventNotHostSignalScopeThenExpectsSystemFenceUsed, IsXeHpcCore) { +HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, + 
givenHwSupportsSystemFenceWhenKernelUsingUsmHostMemoryAllocationsAndEventNotHostSignalScopeThenExpectsNoSystemFenceUsed, IsXeHpcCore) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; ze_result_t result = ZE_RESULT_SUCCESS; @@ -451,13 +453,14 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhen auto walkerCmd = genCmdCast(*itor); auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); result = context->freeMem(ptr); ASSERT_EQ(result, ZE_RESULT_SUCCESS); } -HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhenMigrationOnComputeKernelUsingUsmSharedCpuMemoryAllocationsAndEventNotHostSignalScopeThenExpectsSystemFenceUsed, IsXeHpcCore) { +HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, + givenHwSupportsSystemFenceWhenMigrationOnComputeKernelUsingUsmSharedCpuMemoryAllocationsAndEventNotHostSignalScopeThenExpectsNoSystemFenceUsed, IsXeHpcCore) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; ze_result_t result = ZE_RESULT_SUCCESS; @@ -506,13 +509,14 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhen auto walkerCmd = genCmdCast(*itor); auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); result = context->freeMem(ptr); ASSERT_EQ(result, ZE_RESULT_SUCCESS); } -HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhenKernelUsingIndirectSystemMemoryAllocationsAndEventNotHostSignalScopeThenExpectsSystemFenceUsed, IsXeHpcCore) { +HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, + givenHwSupportsSystemFenceWhenKernelUsingIndirectSystemMemoryAllocationsAndEventNotHostSignalScopeThenExpectsNoSystemFenceUsed, IsXeHpcCore) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; ze_result_t result = ZE_RESULT_SUCCESS; @@ -577,13 +581,14 
@@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhen auto walkerCmd = genCmdCast(*itor); auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); result = context->freeMem(ptr); ASSERT_EQ(result, ZE_RESULT_SUCCESS); } -HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhenKernelUsingDeviceMemoryAllocationsAndEventHostSignalScopeThenExpectsSystemFenceUsed, IsXeHpcCore) { +HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, + givenHwSupportsSystemFenceWhenKernelUsingDeviceMemoryAllocationsAndEventHostSignalScopeThenExpectsSystemFenceUsed, IsXeHpcCore) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; ze_result_t result = ZE_RESULT_SUCCESS; @@ -644,6 +649,74 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, givenHwSupportsSystemFenceWhen auto itor = find(commands.begin(), commands.end()); ASSERT_NE(itor, commands.end()); + auto walkerCmd = genCmdCast(*itor); + auto &postSyncData = walkerCmd->getPostSync(); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + + result = context->freeMem(ptr); + ASSERT_EQ(result, ZE_RESULT_SUCCESS); +} + +HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, + givenHwSupportsSystemFenceWhenKernelUsingUsmHostMemoryAllocationsAndEventHostSignalScopeThenExpectsSystemFenceUsed, IsXeHpcCore) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + + ze_result_t result = ZE_RESULT_SUCCESS; + + auto &hwInfo = *device->getNEODevice()->getRootDeviceEnvironment().getMutableHardwareInfo(); + auto &hwConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily); + + VariableBackup hwRevId{&hwInfo.platform.usRevId}; + hwRevId = hwConfig.getHwRevIdFromStepping(REVISION_B, hwInfo); + + constexpr size_t size = 4096u; + constexpr size_t alignment = 4096u; + void *ptr = nullptr; + + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocHostMem(&hostDesc, size, 
alignment, &ptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_NE(nullptr, ptr); + + Mock<::L0::Kernel> kernel; + auto mockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = mockModule.get(); + + auto allocData = driverHandle->getSvmAllocsManager()->getSVMAlloc(ptr); + ASSERT_NE(nullptr, allocData); + auto kernelAllocation = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex()); + ASSERT_NE(nullptr, kernelAllocation); + kernel.residencyContainer.push_back(kernelAllocation); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + kernel.setGroupSize(1, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + auto commandList = std::make_unique>>(); + result = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + CmdListKernelLaunchParams launchParams = {}; + result = commandList->appendLaunchKernelWithParams(&kernel, &groupCount, event.get(), launchParams); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList commands; + ASSERT_TRUE(CmdParse::parseCommandBuffer( + commands, + commandList->commandContainer.getCommandStream()->getCpuBase(), + commandList->commandContainer.getCommandStream()->getUsed())); + + auto itor = find(commands.begin(), commands.end()); + ASSERT_NE(itor, commands.end()); + auto walkerCmd = genCmdCast(*itor); auto &postSyncData = walkerCmd->getPostSync(); EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl 
index b591296e67..5453a1bec3 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -110,7 +110,8 @@ inline void HardwareInterface::programWalker( numWorkGroups, walkerArgs.localWorkSizes, simd, dim, localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder); - EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), true}; + bool requiredSystemFence = kernel.isAnyKernelArgumentUsingSystemMemory() && walkerArgs.event != nullptr; + EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence}; EncodeDispatchKernel::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs); auto devices = queueCsr.getOsContext().getDeviceBitfield(); diff --git a/opencl/test/unit_test/xe_hpc_core/command_stream_receiver_hw_tests_xe_hpc_core.cpp b/opencl/test/unit_test/xe_hpc_core/command_stream_receiver_hw_tests_xe_hpc_core.cpp index d803c2929d..833bf48540 100644 --- a/opencl/test/unit_test/xe_hpc_core/command_stream_receiver_hw_tests_xe_hpc_core.cpp +++ b/opencl/test/unit_test/xe_hpc_core/command_stream_receiver_hw_tests_xe_hpc_core.cpp @@ -254,51 +254,218 @@ struct SystemMemoryFenceInDefaultConfigurationTest : public UltCommandStreamRece DebugManagerStateRestore restore; }; -HWTEST2_F(SystemMemoryFenceInDefaultConfigurationTest, whenEnqueueKernelIsCalledThenFenceCommandsCanBeGenerated, IsPVC) { +HWTEST2_F(SystemMemoryFenceInDefaultConfigurationTest, givenSpecificDeviceSteppingWhenEnqueueKernelIsCalledThenFenceCommandsNotGenerated, IsPVC) { using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; VariableBackup revisionId(&defaultHwInfo->platform.usRevId); - unsigned short revisions[] = {0x0, 0x3}; - for (auto revision : revisions) { - revisionId = revision; - UltClDeviceFactory 
ultClDeviceFactory{1, 0}; - auto isPvcXlA0Stepping = (revision == 0x0); - auto &clDevice = *ultClDeviceFactory.rootDevices[0]; + constexpr unsigned short revision = 0x0; + revisionId = revision; + UltClDeviceFactory ultClDeviceFactory{1, 0}; + auto &clDevice = *ultClDeviceFactory.rootDevices[0]; - MockKernelWithInternals kernel(clDevice); - MockContext context(&clDevice); - MockCommandQueueHw commandQueue(&context, &clDevice, nullptr); - auto &commandStreamReceiver = clDevice.getUltCommandStreamReceiver(); + MockKernelWithInternals kernel(clDevice); + MockContext context(&clDevice); + MockCommandQueueHw commandQueue(&context, &clDevice, nullptr); + auto &commandStreamReceiver = clDevice.getUltCommandStreamReceiver(); - size_t globalWorkSize[3] = {1, 1, 1}; - commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); + size_t globalWorkSize[3] = {1, 1, 1}; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); - ClHardwareParse hwParser; - hwParser.parseCommands(commandQueue); + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); - auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); - auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); - EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); - auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); - 
auto walkerCmd = genCmdCast(*itorComputeWalker); - auto &postSyncData = walkerCmd->getPostSync(); + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); - auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - if (isPvcXlA0Stepping) { - EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); - EXPECT_EQ(hwParser.cmdList.end(), itorMiMemFence); - } else { - EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); - ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); - auto fenceCmd = genCmdCast(*itorMiMemFence); - ASSERT_NE(nullptr, fenceCmd); - EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE, fenceCmd->getFenceType()); - } - } + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + EXPECT_EQ(hwParser.cmdList.end(), itorMiMemFence); +} + +XE_HPC_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTest, + givenNoEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenNoPostSyncFenceRequestDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + + VariableBackup revisionId(&defaultHwInfo->platform.usRevId); + + constexpr unsigned short revision = 0x3; + revisionId = revision; + UltClDeviceFactory ultClDeviceFactory{1, 0}; + auto &clDevice = *ultClDeviceFactory.rootDevices[0]; + + MockKernelWithInternals kernel(clDevice); + MockContext context(&clDevice); + MockCommandQueueHw commandQueue(&context, &clDevice, nullptr); + auto &commandStreamReceiver = clDevice.getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 1}; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, 
nullptr, nullptr); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE, fenceCmd->getFenceType()); +} + +XE_HPC_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTest, + givenNoEventProvidedWhenEnqueueKernelUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + + VariableBackup revisionId(&defaultHwInfo->platform.usRevId); + + constexpr unsigned short revision = 0x3; + revisionId = revision; + UltClDeviceFactory ultClDeviceFactory{1, 0}; + auto &clDevice = *ultClDeviceFactory.rootDevices[0]; + + MockKernelWithInternals kernel(clDevice); + MockContext context(&clDevice); + MockCommandQueueHw commandQueue(&context, &clDevice, nullptr); + auto &commandStreamReceiver = clDevice.getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 1}; + kernel.mockKernel->anyKernelArgumentUsingSystemMemory = true; + 
commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE, fenceCmd->getFenceType()); +} + +XE_HPC_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTest, + givenEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + + VariableBackup revisionId(&defaultHwInfo->platform.usRevId); + + constexpr unsigned short revision = 0x3; + revisionId = revision; + UltClDeviceFactory ultClDeviceFactory{1, 0}; + auto &clDevice = *ultClDeviceFactory.rootDevices[0]; + + MockKernelWithInternals kernel(clDevice); + MockContext context(&clDevice); + MockCommandQueueHw commandQueue(&context, &clDevice, nullptr); + auto &commandStreamReceiver = clDevice.getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 
1}; + cl_event kernelEvent{}; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &kernelEvent); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE, fenceCmd->getFenceType()); + + auto event = castToObject(kernelEvent); + event->release(); +} + +XE_HPC_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTest, + givenEventProvidedWhenEnqueueKernelUsingSystemMemoryThenPostSyncFenceRequestDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + + VariableBackup revisionId(&defaultHwInfo->platform.usRevId); + + constexpr unsigned short revision = 0x3; + revisionId = revision; + UltClDeviceFactory ultClDeviceFactory{1, 0}; + auto &clDevice = *ultClDeviceFactory.rootDevices[0]; + + MockKernelWithInternals kernel(clDevice); + MockContext context(&clDevice); + MockCommandQueueHw commandQueue(&context, &clDevice, nullptr); + auto 
&commandStreamReceiver = clDevice.getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 1}; + cl_event kernelEvent{}; + kernel.mockKernel->anyKernelArgumentUsingSystemMemory = true; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &kernelEvent); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE, fenceCmd->getFenceType()); + + auto event = castToObject(kernelEvent); + event->release(); } diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index e0fcc614a1..8dbd6dc5e7 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -266,7 +266,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeWalkerArgs walkerArgs{ args.isCooperative ? 
KernelExecutionType::Concurrent : KernelExecutionType::Default, - args.isHostScopeSignalEvent || args.isKernelUsingSystemAllocation}; + args.isHostScopeSignalEvent && args.isKernelUsingSystemAllocation}; EncodeDispatchKernel::encodeAdditionalWalkerFields(hwInfo, walkerCmd, walkerArgs); PreemptionHelper::applyPreemptionWaCmdsBegin(listCmdBufferStream, *args.device); diff --git a/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp b/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp index 4e542d28ad..1c0b0cd5fa 100644 --- a/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp +++ b/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp @@ -347,9 +347,16 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenFenceAsPostSyncOperationInComp XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceAsPostSyncOperationInComputeWalkerWhenEnqueueKernelIsCalledThenDoNotGenerateFenceCommands) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + DebugManagerStateRestore restore; DebugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); + auto &hwInfo = *pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); + auto &hwConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily); + + VariableBackup hwRevId{&hwInfo.platform.usRevId}; + hwRevId = hwConfig.getHwRevIdFromStepping(REVISION_A0, hwInfo); + uint32_t dims[] = {1, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); dispatchInterface->getCrossThreadDataSizeResult = 0u; @@ -372,14 +379,14 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceAsPostSy EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); } -XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceWhenHwSupportsSystemFenceWhenKernelUsesSystemFlagTrueThenExpectSystemFenceUsed) { +XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, 
givenDefaultSettingForFenceWhenKernelUsesSystemMemoryFlagTrueAndNoHostSignalEventThenNotUseSystemFence) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; DebugManagerStateRestore restore; DebugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); auto &hwInfo = *pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); - auto &hwConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily); + auto &hwConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily); unsigned short pvcRevB = hwConfig.getHwRevIdFromStepping(REVISION_B, hwInfo); VariableBackup hwRevId(&hwInfo.platform.usRevId, pvcRevB); @@ -405,17 +412,17 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceWhenHwSu auto walkerCmd = genCmdCast(*itor); auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); } -XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceWhenHwSupportsSystemFenceWhenEventHostScopeSignalFlagTrueThenExpectSystemFenceUsed) { +XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceWhenEventHostSignalScopeFlagTrueAndNoSystemMemoryThenNotUseSystemFence) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; DebugManagerStateRestore restore; DebugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); auto &hwInfo = *pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); - auto &hwConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily); + auto &hwConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily); unsigned short pvcRevB = hwConfig.getHwRevIdFromStepping(REVISION_B, hwInfo); VariableBackup hwRevId(&hwInfo.platform.usRevId, pvcRevB); @@ -439,6 +446,39 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceWhenHwSu auto itor = find(commands.begin(), commands.end()); ASSERT_NE(itor, commands.end()); + auto walkerCmd = 
genCmdCast(*itor); + auto &postSyncData = walkerCmd->getPostSync(); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); +} + +XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDefaultSettingForFenceWhenKernelUsesSystemMemoryAndHostSignalEventFlagTrueThenUseSystemFence) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + + DebugManagerStateRestore restore; + DebugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); + + auto &hwInfo = *pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); + auto &hwConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily); + + unsigned short pvcRevB = hwConfig.getHwRevIdFromStepping(REVISION_B, hwInfo); + VariableBackup hwRevId(&hwInfo.platform.usRevId, pvcRevB); + + uint32_t dims[] = {1, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + dispatchInterface->getCrossThreadDataSizeResult = 0; + + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, false); + dispatchArgs.isKernelUsingSystemAllocation = true; + dispatchArgs.isHostScopeSignalEvent = true; + + EncodeDispatchKernel::encode(*cmdContainer.get(), dispatchArgs); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + + auto itor = find(commands.begin(), commands.end()); + ASSERT_NE(itor, commands.end()); + auto walkerCmd = genCmdCast(*itor); auto &postSyncData = walkerCmd->getPostSync(); EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest());