From cffe7f158a4215974dede2e2f6a430a86293a0f1 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Tue, 19 Apr 2022 07:00:53 +0000 Subject: [PATCH] Use postsync for copy and fill Related-To: NEO-5968 Signed-off-by: Aravind Gopalakrishnan --- level_zero/core/source/cmdlist/cmdlist_hw.h | 5 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 56 ++++-- .../core/source/cmdlist/cmdlist_hw_base.inl | 31 +++ .../cmdlist/cmdlist_hw_xehp_and_later.inl | 47 ++++- level_zero/core/source/event/event.cpp | 4 +- level_zero/core/source/event/event_impl.inl | 3 +- .../test_cmdlist_append_launch_kernel_1.cpp | 113 +++++++++++ .../cmdlist/test_cmdlist_append_memory.cpp | 3 +- .../xe_hp_core/test_cmdlist_xe_hp_core.cpp | 180 ++++++++++++++++++ 9 files changed, 411 insertions(+), 31 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 4e57d3af0b..0d30d164b5 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -235,6 +235,11 @@ struct CommandListCoreFamily : CommandListImp { void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker); void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker); void appendSignalEventPostWalker(ze_event_handle_t hEvent, bool workloadPartition); + void programEventL3Flush(ze_event_handle_t hEvent, + Device *device, + uint32_t partitionCount, + NEO::CommandContainer &commandContainer); + void adjustEventKernelCount(ze_event_handle_t hEvent); void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired); void appendComputeBarrierCommand(); NEO::PipeControlArgs createBarrierFlags(); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index af0be219af..5bed932a81 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -224,8 +224,17 @@ ze_result_t CommandListCoreFamily::appendLaunchCooperativeKernel( return ret; } - return appendLaunchKernelWithParams(hKernel, pLaunchFuncArgs, - hSignalEvent, false, false, true); + ret = appendLaunchKernelWithParams(hKernel, pLaunchFuncArgs, + hSignalEvent, false, false, true); + if (ret) { + return ret; + } + + if (hSignalEvent) { + programEventL3Flush(hSignalEvent, this->device, this->partitionCount, commandContainer); + } + + return ret; } template @@ -242,6 +251,12 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelIndirect(ze_ appendEventForProfiling(hEvent, true, false); ret = appendLaunchKernelWithParams(hKernel, pDispatchArgumentsBuffer, nullptr, true, false, false); + if (ret) { + return ret; + } + if (hEvent) { + programEventL3Flush(hEvent, this->device, this->partitionCount, commandContainer); + } appendSignalEventPostWalker(hEvent, false); return ret; @@ -276,7 +291,9 @@ ze_result_t CommandListCoreFamily::appendLaunchMultipleKernelsInd return ret; } } - + if (hEvent) { + programEventL3Flush(hEvent, this->device, this->partitionCount, commandContainer); + } appendSignalEventPostWalker(hEvent, false); return ret; @@ -800,22 +817,6 @@ ze_result_t CommandListCoreFamily::appendMemAdvise(ze_device_hand return ZE_RESULT_ERROR_INVALID_ARGUMENT; } -template -ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(ze_kernel_handle_t hKernel, - const ze_group_count_t *pThreadGroupDimensions, - ze_event_handle_t hEvent) { - return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false, false); -} - -template -void CommandListCoreFamily::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) { - if (beforeWalker) { - appendEventForProfiling(hEvent, true, false); - } else { - appendSignalEventPostWalker(hEvent, false); - } -} - template ze_result_t CommandListCoreFamily::appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -1069,6 +1070,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, } appendEventForProfilingAllWalkers(hSignalEvent, true); + adjustEventKernelCount(hSignalEvent); if (ret == ZE_RESULT_SUCCESS && leftSize) { Builtin func = Builtin::CopyBufferToBufferSide; @@ -1128,6 +1130,9 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, isStateless); } + if (hSignalEvent) { + programEventL3Flush(hSignalEvent, this->device, this->partitionCount, commandContainer); + } appendEventForProfilingAllWalkers(hSignalEvent, false); const auto &hwInfo = this->device->getHwInfo(); @@ -1135,9 +1140,12 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, auto event = Event::fromHandle(hSignalEvent); if (event) { dstAllocationStruct.needsFlush &= !event->signalScope; + dstAllocationStruct.needsFlush &= !event->l3FlushWaApplied; } - if (dstAllocationStruct.needsFlush && !isCopyOnly()) { + dstAllocationStruct.needsFlush &= !isCopyOnly(); + + if (dstAllocationStruct.needsFlush) { NEO::PipeControlArgs args; args.dcFlushEnable = true; NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); @@ -1452,6 +1460,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, builtinFunction->setArgumentValue(2, sizeof(value), &value); appendEventForProfilingAllWalkers(hSignalEvent, true); + adjustEventKernelCount(hSignalEvent); uint32_t groups = static_cast(size) / groupSizeX; ze_group_count_t dispatchFuncArgs{groups, 1u, 1u}; @@ -1526,6 +1535,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, builtinFunction->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls); appendEventForProfilingAllWalkers(hSignalEvent, true); + adjustEventKernelCount(hSignalEvent); ze_group_count_t dispatchFuncArgs{groups, 1u, 1u}; res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent); @@ -1564,6 +1574,9 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } } + if (hSignalEvent) { + programEventL3Flush(hSignalEvent, this->device, this->partitionCount, commandContainer); + } appendEventForProfilingAllWalkers(hSignalEvent, false); const auto &hwInfo = this->device->getHwInfo(); @@ -1571,8 +1584,11 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, auto event = Event::fromHandle(hSignalEvent); if (event) { hostPointerNeedsFlush &= !event->signalScope; + hostPointerNeedsFlush &= !event->l3FlushWaApplied; } + hostPointerNeedsFlush &= !isCopyOnly(); + if (hostPointerNeedsFlush) { NEO::PipeControlArgs args; args.dcFlushEnable = true; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 027e79a19e..35da7a0334 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -32,6 +32,26 @@ size_t CommandListCoreFamily::getReserveSshSize() { return helper.getRenderSurfaceStateSize(); } +template +void CommandListCoreFamily::programEventL3Flush(ze_event_handle_t hEvent, + Device *device, + uint32_t partitionCount, + NEO::CommandContainer &commandContainer) { +} + +template +void CommandListCoreFamily::adjustEventKernelCount(ze_event_handle_t hEvent) { +} + +template +void CommandListCoreFamily::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) { + if (beforeWalker) { + appendEventForProfiling(hEvent, true, false); + } else { + appendSignalEventPostWalker(hEvent, false); + } +} + template ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, @@ -174,6 +194,17 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z return ZE_RESULT_SUCCESS; } +template +ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(ze_kernel_handle_t hKernel, + const ze_group_count_t *pThreadGroupDimensions, + ze_event_handle_t hEvent) { + if (hEvent) { + auto event = Event::fromHandle(hEvent); + event->kernelCount = 1; + } + return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false, false); +} + template void CommandListCoreFamily::appendMultiPartitionPrologue(uint32_t partitionDataSize) {} diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index d7bd0ed666..36769953ec 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -84,14 +84,20 @@ void CommandListCoreFamily::applyMemoryRangesBarrier(uint32_t num } template -void programEventL3Flush(ze_event_handle_t hEvent, - Device *device, - uint32_t partitionCount, - NEO::CommandContainer &commandContainer) { +void CommandListCoreFamily::programEventL3Flush(ze_event_handle_t hEvent, + Device *device, + uint32_t partitionCount, + NEO::CommandContainer &commandContainer) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using POST_SYNC_OPERATION = typename GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION; auto event = Event::fromHandle(hEvent); + const auto &hwInfo = this->device->getHwInfo(); + bool L3FlushEnable = NEO::MemorySynchronizationCommands::getDcFlushEnable(event->signalScope, hwInfo); + if (!L3FlushEnable || isCopyOnly()) { + return; + } + auto eventPartitionOffset = (partitionCount > 1) ? (partitionCount * event->getSinglePacketSize()) : event->getSinglePacketSize(); uint64_t eventAddress = event->getPacketAddress(device) + eventPartitionOffset; @@ -121,6 +127,13 @@ void programEventL3Flush(ze_event_handle_t hEvent, args); } +template +void CommandListCoreFamily::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) { + if (hEvent && isCopyOnly()) { + appendSignalEventPostWalker(hEvent, false); + } +} + template ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, @@ -165,6 +178,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z commandContainer.addToResidencyContainer(eventAlloc); L3FlushEnable = NEO::MemorySynchronizationCommands::getDcFlushEnable(event->signalScope, hwInfo); isTimestampEvent = event->isUsingContextEndOffset(); + eventAddress = event->getPacketAddress(this->device); } @@ -238,9 +252,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z if (partitionCount > 1) { event->setPacketsInUse(partitionCount); } - if (L3FlushEnable) { - programEventL3Flush(hEvent, this->device, partitionCount, commandContainer); - } + programEventL3Flush(hEvent, this->device, this->partitionCount, commandContainer); } if (neoDevice->getDebugger()) { @@ -292,6 +304,27 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z return ZE_RESULT_SUCCESS; } +template +ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(ze_kernel_handle_t hKernel, + const ze_group_count_t *pThreadGroupDimensions, + ze_event_handle_t hEvent) { + if (hEvent) { + auto event = Event::fromHandle(hEvent); + event->kernelCount += 1; + } + return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, hEvent, false, false, false); +} + +template +void CommandListCoreFamily::adjustEventKernelCount(ze_event_handle_t hEvent) { + if (hEvent) { + auto event = Event::fromHandle(hEvent); + if (!isCopyOnly()) { + event->kernelCount = 0u; + } + } +} + template void CommandListCoreFamily::appendMultiPartitionPrologue(uint32_t partitionDataSize) { NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp index 8810955ee5..29825c55c9 100644 --- a/level_zero/core/source/event/event.cpp +++ b/level_zero/core/source/event/event.cpp @@ -80,8 +80,8 @@ ze_result_t EventPoolImp::initialize(DriverHandle *driver, Context *context, uin eventSize = static_cast(alignUp(EventPacketsCount::eventPackets * hwHelper.getSingleTimestampPacketSize(), eventAlignment)); size_t alignedSize = alignUp(numEvents * eventSize, MemoryConstants::pageSize64k); - NEO::AllocationType allocationType = isEventPoolTimestampFlagSet() ? NEO::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER - : NEO::AllocationType::BUFFER_HOST_MEMORY; + NEO::AllocationType allocationType = NEO::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER; + if (this->devices.size() > 1) { useDeviceAlloc = false; } diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index 636add0a05..1eb25e5948 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -393,7 +393,8 @@ uint32_t EventImp::getPacketsUsedInLastKernel() { template void EventImp::setPacketsInUse(uint32_t value) { - kernelEventCompletionData[getCurrKernelDataIndex()].setPacketsUsed(value); + auto kernelIndex = getCurrKernelDataIndex(); + kernelEventCompletionData[kernelIndex].setPacketsUsed(value); } template diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp index 5bef27728e..65695b522b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp @@ -855,6 +855,39 @@ HWTEST_F(CommandListAppendLaunchKernelWithImplicitArgs, givenIndirectDispatchWit context->freeMem(alloc); } +HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWithEventThenSuccessIsReturned) { + Mock<::L0::Kernel> kernel; + kernel.groupSize[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.workDim = 4; + ze_result_t returnValue; + std::unique_ptr commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + eventPoolDesc.count = 1; + + ze_event_desc_t eventDesc = {}; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.index = 0; + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + std::unique_ptr event(Event::create(eventPool.get(), &eventDesc, device)); + ze_event_handle_t hEventHandle = event->toHandle(); + + void *alloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc); + ASSERT_EQ(result, ZE_RESULT_SUCCESS); + + result = commandList->appendLaunchKernelIndirect(kernel.toHandle(), + static_cast(alloc), + hEventHandle, 0, nullptr); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + context->freeMem(alloc); +} HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeAndWorkDimIsSetInCrossThreadData) { using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; @@ -1202,6 +1235,37 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenAppendLaunchMult context->freeMem(reinterpret_cast(numLaunchArgs)); } +HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenAppendLaunchMultipleKernelsWithEventThenSuccessIsReturned) { + createKernel(); + + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; + using MI_MATH = typename FamilyType::MI_MATH; + ze_result_t returnValue; + auto commandList = std::unique_ptr(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)); + const ze_kernel_handle_t launchFn[3] = {kernel->toHandle(), kernel->toHandle(), kernel->toHandle()}; + uint32_t *numLaunchArgs; + const uint32_t numKernels = 3; + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = context->allocDeviceMem( + device->toHandle(), &deviceDesc, 16384u, 4096u, reinterpret_cast(&numLaunchArgs)); + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + eventPoolDesc.count = 1; + + ze_event_desc_t eventDesc = {}; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.index = 0; + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + std::unique_ptr event(Event::create(eventPool.get(), &eventDesc, device)); + ze_event_handle_t hEventHandle = event->toHandle(); + + result = commandList->appendLaunchMultipleKernelsIndirect(numKernels, launchFn, numLaunchArgs, nullptr, hEventHandle, 0, nullptr); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + context->freeMem(reinterpret_cast(numLaunchArgs)); +} + HWTEST_F(CommandListAppendLaunchKernel, givenInvalidEventListWhenAppendLaunchCooperativeKernelIsCalledThenErrorIsReturned) { createKernel(); @@ -1213,6 +1277,55 @@ HWTEST_F(CommandListAppendLaunchKernel, givenInvalidEventListWhenAppendLaunchCoo EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, returnValue); } +HWTEST_F(CommandListAppendLaunchKernel, givenAppendLaunchCooperativeKernelIsCalledWithEventWithHostScopeThenSuccessIsReturned) { + createKernel(); + + ze_group_count_t groupCount{1, 1, 1}; + ze_result_t returnValue; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + returnValue = commandList->appendLaunchCooperativeKernel(kernel->toHandle(), &groupCount, event->toHandle(), 0, nullptr); + + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); +} + +HWTEST2_F(CommandListAppendLaunchKernel, givenAppendLaunchCooperativeKernelIsCalledWithNoEventScopeThenSuccessIsReturnedAndL3WaNotApplied, IsXeHpCore) { + createKernel(); + + ze_group_count_t groupCount{1, 1, 1}; + ze_result_t returnValue; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + returnValue = commandList->appendLaunchCooperativeKernel(kernel->toHandle(), &groupCount, event->toHandle(), 0, nullptr); + + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + EXPECT_FALSE(event->l3FlushWaApplied); +} + HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelIsCalledThenCorrectValueIsReturned, IsAtLeastSkl) { Mock<::L0::Kernel> kernel; auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp index bf80640eb2..eb6006b51e 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -320,6 +320,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListWhenTimestampPassedToMemoryCopyThenA EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + EXPECT_EQ(1u, event->getPacketsInUse()); commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100, event->toHandle(), 0, nullptr); EXPECT_GT(commandList.appendMemoryCopyKernelWithGACalled, 0u); EXPECT_EQ(commandList.appendMemoryCopyBlitCalled, 0u); diff --git a/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp b/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp index 27a0070e55..4a69c31e85 100644 --- a/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp @@ -404,6 +404,124 @@ HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenKernelWithEventHostScopeWit EXPECT_EQ(data[0].globalEnd, tsResult.global.kernelEnd); } +HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenCopyCommandListWithAppendMemoryCopyThenL3FlushWaNotApplied, IsXeHpCore) { + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Copy, 0u); + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + void *dstBuffer = nullptr; + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocHostMem(&hostDesc, 16384u, 4090u, &dstBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + ze_device_mem_alloc_desc_t deviceDesc = {}; + void *srcBuffer = nullptr; + result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &srcBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->appendMemoryCopy(dstBuffer, srcBuffer, 16384u, event->toHandle(), 0, nullptr); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_FALSE(event->l3FlushWaApplied); + context->freeMem(dstBuffer); + context->freeMem(srcBuffer); +} + +HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenCopyCommandListWithAppendMemoryFillThenL3FlushWaNotApplied, IsXeHpCore) { + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Copy, 0u); + + void *dstBuffer = nullptr; + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocHostMem(&hostDesc, 16384u, 4090u, &dstBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + int one = 1; + result = commandList->appendMemoryFill(dstBuffer, reinterpret_cast(&one), sizeof(one), 4096u, + event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_FALSE(event->l3FlushWaApplied); + context->freeMem(dstBuffer); +} + +template +class MockCommandListKernelLaunchError : public WhiteBox<::L0::CommandListCoreFamily> { + public: + MockCommandListKernelLaunchError() : WhiteBox<::L0::CommandListCoreFamily>() {} + + ze_result_t appendLaunchKernelWithParams(ze_kernel_handle_t hKernel, + const ze_group_count_t *pThreadGroupDimensions, + ze_event_handle_t hEvent, + bool isIndirect, + bool isPredicate, + bool isCooperative) override { + return ZE_RESULT_ERROR_UNKNOWN; + } +}; + +HWTEST2_F(CommandListCreate, whenReturningErrorFromLaunchKernelWithParamsThenLaunchIndirectIsUnsuccessful, IsXeHpCore) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + Mock<::L0::Kernel> kernel; + kernel.groupSize[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.workDim = 4; + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + + void *alloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc); + ASSERT_EQ(result, ZE_RESULT_SUCCESS); + + result = commandList->appendLaunchKernelIndirect(kernel.toHandle(), + static_cast(alloc), + nullptr, 0, nullptr); + EXPECT_EQ(result, ZE_RESULT_ERROR_UNKNOWN); + context->freeMem(alloc); +} + +HWTEST2_F(CommandListCreate, whenReturningErrorFromLaunchKernelWithParamsThenLaunchiCooperativeKernelIsUnsuccessful, IsXeHpCore) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + Mock<::L0::Kernel> kernel; + kernel.groupSize[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2; + kernel.descriptor.payloadMappings.dispatchTraits.workDim = 4; + + ze_group_count_t groupCount{1, 1, 1}; + ze_result_t returnValue; + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + + returnValue = commandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr); + EXPECT_EQ(returnValue, ZE_RESULT_ERROR_UNKNOWN); +} + HWTEST2_F(CommandListCreate, WhenCreatingCommandListThenBindingTablePoolAllocAddedToBatchBuffer, IsXeHpCore) { using _3DSTATE_BINDING_TABLE_POOL_ALLOC = typename FamilyType::_3DSTATE_BINDING_TABLE_POOL_ALLOC; @@ -520,6 +638,68 @@ HWTEST2_F(CommandListCreate, givenNotCopyCommandListWhenProfilingEventAfterComma EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); } +HWTEST2_F(CommandListCreate, givenCommandListWhenAppendMemoryFillWithSignalEventThenL3FlushWaApplied, IsXeHpCore) { + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + void *dstBuffer = nullptr; + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocHostMem(&hostDesc, 16384u, 4090u, &dstBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + int one = 1; + result = commandList->appendMemoryFill(dstBuffer, reinterpret_cast(&one), sizeof(one), 4096u, + event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_TRUE(event->l3FlushWaApplied); + context->freeMem(dstBuffer); +} + +HWTEST2_F(CommandListCreate, givenCommandListWhenAppendMemoryCopyWithSignalEventThenL3FlushWaApplied, IsXeHpCore) { + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + void *dstBuffer = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &dstBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + void *srcBuffer = nullptr; + result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &srcBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->appendMemoryCopy(dstBuffer, srcBuffer, 16384u, event->toHandle(), 0, nullptr); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_TRUE(event->l3FlushWaApplied); + context->freeMem(dstBuffer); + context->freeMem(srcBuffer); +} + HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingEventThenStoreRegCommandIsAdded, IsXeHpCore) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;