From 37ec3cb74e856cf80ca8df90293e100c842eabce Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 Oct 2021 19:05:16 +0000 Subject: [PATCH] Events workaround for L3Flush issue Related-To: LOCI-2361 Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Vinod Tipparaju --- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 43 +++++- level_zero/core/source/event/event.h | 7 +- level_zero/core/source/event/event_impl.inl | 112 +++++++------- .../unit_tests/sources/event/test_event.cpp | 28 ++-- .../xe_hp_core/test_cmdlist_xe_hp_core.cpp | 140 ++++++++++++++++++ 5 files changed, 260 insertions(+), 70 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 7fc92ce393..a7a363154c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -110,6 +110,44 @@ void CommandListCoreFamily::applyMemoryRangesBarrier(uint32_t num } } +template +void programEventL3Flush(ze_event_handle_t hEvent, + Device *device, + uint32_t partitionCount, + NEO::CommandContainer &commandContainer) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + using POST_SYNC_OPERATION = typename GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION; + auto event = Event::fromHandle(hEvent); + + auto eventPartitionOffset = (partitionCount > 1) ? (partitionCount * event->getSinglePacketSize()) + : event->getSinglePacketSize(); + uint64_t eventAddress = event->getPacketAddress(device) + eventPartitionOffset; + if (event->isEventTimestampFlagSet()) { + eventAddress += event->getContextEndOffset(); + } + + if (partitionCount > 1) { + event->setPacketsInUse(event->getPacketsInUse() + partitionCount); + } else { + event->setPacketsInUse(event->getPacketsInUse() + 1); + } + + NEO::PipeControlArgs args; + args.dcFlushEnable = true; + if (partitionCount > 1) { + args.workloadPartitionOffset = true; + NEO::EncodeSetMMIO::encodeIMM(*commandContainer.getCommandStream(), + NEO::PartitionRegisters::addressOffsetCCSOffset, + static_cast(event->getSinglePacketSize()), + true); + } + NEO::MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *commandContainer.getCommandStream(), POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + eventAddress, Event::STATE_SIGNALED, + commandContainer.getDevice()->getHardwareInfo(), + args); +} + template ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, @@ -228,9 +266,12 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z this->partitionCount = std::max(partitionCount, this->partitionCount); if (hEvent) { auto event = Event::fromHandle(hEvent); - if (isTimestampEvent && partitionCount > 1) { + if (partitionCount > 1) { event->setPacketsInUse(partitionCount); } + if (L3FlushEnable) { + programEventL3Flush(hEvent, this->device, partitionCount, commandContainer); + } } if (neoDevice->getDebugger()) { diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index 16ed4003c2..ed4d6e1d65 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -95,7 +95,7 @@ struct Event : _ze_event_handle_t { }; template -class KernelTimestampsData : public NEO::TimestampPackets { +class KernelEventCompletionData : public NEO::TimestampPackets { public: uint32_t getPacketsUsed() const { return packetsUsed; } void setPacketsUsed(uint32_t value) { packetsUsed = value; } @@ -139,7 +139,7 @@ struct EventImp : public Event { size_t getSinglePacketSize() const override { return NEO::TimestampPackets::getSinglePacketSize(); }; ze_result_t hostEventSetValue(uint32_t eventValue) override; - std::unique_ptr[]> kernelTimestampsData; + std::unique_ptr[]> kernelEventCompletionData; Device *device; int index; @@ -148,8 +148,9 @@ struct EventImp : public Event { protected: ze_result_t calculateProfilingData(); ze_result_t queryStatusKernelTimestamp(); + ze_result_t queryStatusNonTimestamp(); ze_result_t hostEventSetValueTimestamps(TagSizeT eventVal); - void assignTimestampData(void *address); + void assignKernelEventCompletionData(void *address); }; struct EventPool : _ze_event_pool_handle_t { diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index 6d64123d45..540297110f 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -15,8 +15,8 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device * if (eventPool->isEventPoolTimestampFlagSet()) { event->setEventTimestampFlag(true); - event->kernelTimestampsData = std::make_unique[]>(EventPacketsCount::maxKernelSplit); } + event->kernelEventCompletionData = std::make_unique[]>(EventPacketsCount::maxKernelSplit); auto alloc = eventPool->getAllocation().getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex()); @@ -49,24 +49,24 @@ NEO::GraphicsAllocation &EventImp::getAllocation(Device *device) { template ze_result_t EventImp::calculateProfilingData() { - globalStartTS = kernelTimestampsData[0].getGlobalStartValue(0); - globalEndTS = kernelTimestampsData[0].getGlobalEndValue(0); - contextStartTS = kernelTimestampsData[0].getContextStartValue(0); - contextEndTS = kernelTimestampsData[0].getContextEndValue(0); + globalStartTS = kernelEventCompletionData[0].getGlobalStartValue(0); + globalEndTS = kernelEventCompletionData[0].getGlobalEndValue(0); + contextStartTS = kernelEventCompletionData[0].getContextStartValue(0); + contextEndTS = kernelEventCompletionData[0].getContextEndValue(0); for (uint32_t i = 0; i < kernelCount; i++) { - for (auto packetId = 0u; packetId < kernelTimestampsData[i].getPacketsUsed(); packetId++) { - if (globalStartTS > kernelTimestampsData[i].getGlobalStartValue(packetId)) { - globalStartTS = kernelTimestampsData[i].getGlobalStartValue(packetId); + for (auto packetId = 0u; packetId < kernelEventCompletionData[i].getPacketsUsed(); packetId++) { + if (globalStartTS > kernelEventCompletionData[i].getGlobalStartValue(packetId)) { + globalStartTS = kernelEventCompletionData[i].getGlobalStartValue(packetId); } - if (contextStartTS > kernelTimestampsData[i].getContextStartValue(packetId)) { - contextStartTS = kernelTimestampsData[i].getContextStartValue(packetId); + if (contextStartTS > kernelEventCompletionData[i].getContextStartValue(packetId)) { + contextStartTS = kernelEventCompletionData[i].getContextStartValue(packetId); } - if (contextEndTS < kernelTimestampsData[i].getContextEndValue(packetId)) { - contextEndTS = kernelTimestampsData[i].getContextEndValue(packetId); + if (contextEndTS < kernelEventCompletionData[i].getContextEndValue(packetId)) { + contextEndTS = kernelEventCompletionData[i].getContextEndValue(packetId); } - if (globalEndTS < kernelTimestampsData[i].getGlobalEndValue(packetId)) { - globalEndTS = kernelTimestampsData[i].getGlobalEndValue(packetId); + if (globalEndTS < kernelEventCompletionData[i].getGlobalEndValue(packetId)) { + globalEndTS = kernelEventCompletionData[i].getGlobalEndValue(packetId); } } } @@ -75,11 +75,12 @@ ze_result_t EventImp::calculateProfilingData() { } template -void EventImp::assignTimestampData(void *address) { +void EventImp::assignKernelEventCompletionData(void *address) { for (uint32_t i = 0; i < kernelCount; i++) { - uint32_t packetsToCopy = kernelTimestampsData[i].getPacketsUsed(); + uint32_t packetsToCopy = 0; + packetsToCopy = kernelEventCompletionData[i].getPacketsUsed(); for (uint32_t packetId = 0; packetId < packetsToCopy; packetId++) { - kernelTimestampsData[i].assignDataToAllTimestamps(packetId, address); + kernelEventCompletionData[i].assignDataToAllTimestamps(packetId, address); address = ptrOffset(address, NEO::TimestampPackets::getSinglePacketSize()); } } @@ -87,11 +88,27 @@ void EventImp::assignTimestampData(void *address) { template ze_result_t EventImp::queryStatusKernelTimestamp() { - assignTimestampData(hostAddress); + assignKernelEventCompletionData(hostAddress); + uint32_t queryVal = Event::STATE_CLEARED; for (uint32_t i = 0; i < kernelCount; i++) { - uint32_t packetsToCheck = kernelTimestampsData[i].getPacketsUsed(); + uint32_t packetsToCheck = kernelEventCompletionData[i].getPacketsUsed(); for (uint32_t packetId = 0; packetId < packetsToCheck; packetId++) { - if (kernelTimestampsData[i].getContextEndValue(packetId) == Event::STATE_CLEARED) { + if (kernelEventCompletionData[i].getContextEndValue(packetId) == queryVal) { + return ZE_RESULT_NOT_READY; + } + } + } + return ZE_RESULT_SUCCESS; +} + +template +ze_result_t EventImp::queryStatusNonTimestamp() { + assignKernelEventCompletionData(hostAddress); + uint32_t queryVal = Event::STATE_CLEARED; + for (uint32_t i = 0; i < kernelCount; i++) { + uint32_t packetsToCheck = kernelEventCompletionData[i].getPacketsUsed(); + for (uint32_t packetId = 0; packetId < packetsToCheck; packetId++) { + if (kernelEventCompletionData[i].getContextStartValue(packetId) == queryVal) { return ZE_RESULT_NOT_READY; } } @@ -102,7 +119,6 @@ ze_result_t EventImp::queryStatusKernelTimestamp() { template ze_result_t EventImp::queryStatus() { uint64_t *hostAddr = static_cast(hostAddress); - uint32_t queryVal = Event::STATE_CLEARED; if (metricStreamer != nullptr) { *hostAddr = metricStreamer->getNotificationState(); @@ -110,9 +126,9 @@ ze_result_t EventImp::queryStatus() { this->csr->downloadAllocations(); if (isEventTimestampFlagSet()) { return queryStatusKernelTimestamp(); + } else { + return queryStatusNonTimestamp(); } - memcpy_s(static_cast(&queryVal), sizeof(uint32_t), static_cast(hostAddr), sizeof(uint32_t)); - return (queryVal == Event::STATE_CLEARED) ? ZE_RESULT_NOT_READY : ZE_RESULT_SUCCESS; } template @@ -130,7 +146,7 @@ ze_result_t EventImp::hostEventSetValueTimestamps(TagSizeT eventVal) { } }; for (uint32_t i = 0; i < kernelCount; i++) { - uint32_t packetsToSet = kernelTimestampsData[i].getPacketsUsed(); + uint32_t packetsToSet = kernelEventCompletionData[i].getPacketsUsed(); for (uint32_t j = 0; j < packetsToSet; j++) { eventTsSetFunc(baseAddr + NEO::TimestampPackets::getContextStartOffset()); eventTsSetFunc(baseAddr + NEO::TimestampPackets::getGlobalStartOffset()); @@ -139,7 +155,7 @@ ze_result_t EventImp::hostEventSetValueTimestamps(TagSizeT eventVal) { baseAddr += NEO::TimestampPackets::getSinglePacketSize(); } } - assignTimestampData(hostAddress); + assignKernelEventCompletionData(hostAddress); return ZE_RESULT_SUCCESS; } @@ -208,14 +224,12 @@ ze_result_t EventImp::reset() { if (isEventTimestampFlagSet()) { kernelCount = EventPacketsCount::maxKernelSplit; for (uint32_t i = 0; i < kernelCount; i++) { - kernelTimestampsData[i].setPacketsUsed(NEO::TimestampPacketSizeControl::preferredPacketCount); + kernelEventCompletionData[i].setPacketsUsed(NEO::TimestampPacketSizeControl::preferredPacketCount); } - hostEventSetValue(Event::STATE_INITIAL); - resetPackets(); - return ZE_RESULT_SUCCESS; - } else { - return hostEventSetValue(Event::STATE_INITIAL); } + hostEventSetValue(Event::STATE_INITIAL); + resetPackets(); + return ZE_RESULT_SUCCESS; } template @@ -227,7 +241,7 @@ ze_result_t EventImp::queryKernelTimestamp(ze_kernel_timestamp_result_ return ZE_RESULT_NOT_READY; } - assignTimestampData(hostAddress); + assignKernelEventCompletionData(hostAddress); calculateProfilingData(); auto eventTsSetFunc = [&](uint64_t ×tampFieldToCopy, uint64_t ×tampFieldForWriting) { @@ -288,10 +302,10 @@ ze_result_t EventImp::queryTimestampsExp(Device *device, uint32_t *pCo packetId = static_cast(deviceImp->neoDevice)->getSubDeviceIndex(); } - globalStartTs = kernelTimestampsData[timestampPacket].getGlobalStartValue(packetId); - contextStartTs = kernelTimestampsData[timestampPacket].getContextStartValue(packetId); - contextEndTs = kernelTimestampsData[timestampPacket].getContextEndValue(packetId); - globalEndTs = kernelTimestampsData[timestampPacket].getGlobalEndValue(packetId); + globalStartTs = kernelEventCompletionData[timestampPacket].getGlobalStartValue(packetId); + contextStartTs = kernelEventCompletionData[timestampPacket].getContextStartValue(packetId); + contextEndTs = kernelEventCompletionData[timestampPacket].getContextEndValue(packetId); + globalEndTs = kernelEventCompletionData[timestampPacket].getGlobalEndValue(packetId); queryTsEventAssignFunc(result.global.kernelStart, globalStartTs); queryTsEventAssignFunc(result.context.kernelStart, contextStartTs); @@ -305,37 +319,31 @@ ze_result_t EventImp::queryTimestampsExp(Device *device, uint32_t *pCo template void EventImp::resetPackets() { for (uint32_t i = 0; i < kernelCount; i++) { - kernelTimestampsData[i].setPacketsUsed(1); + kernelEventCompletionData[i].setPacketsUsed(1); } kernelCount = 1; } template uint32_t EventImp::getPacketsInUse() { - if (isEventTimestampFlagSet()) { - uint32_t packetsInUse = 0; - for (uint32_t i = 0; i < kernelCount; i++) { - packetsInUse += kernelTimestampsData[i].getPacketsUsed(); - }; - return packetsInUse; - } else { - return 1; + uint32_t packetsInUse = 0; + for (uint32_t i = 0; i < kernelCount; i++) { + packetsInUse += kernelEventCompletionData[i].getPacketsUsed(); } + return packetsInUse; } template void EventImp::setPacketsInUse(uint32_t value) { - kernelTimestampsData[getCurrKernelDataIndex()].setPacketsUsed(value); -}; + kernelEventCompletionData[getCurrKernelDataIndex()].setPacketsUsed(value); +} template uint64_t EventImp::getPacketAddress(Device *device) { uint64_t address = getGpuAddress(device); - if (isEventTimestampFlagSet() && kernelCount > 1) { - for (uint32_t i = 0; i < kernelCount - 1; i++) { - address += kernelTimestampsData[i].getPacketsUsed() * - NEO::TimestampPackets::getSinglePacketSize(); - } + for (uint32_t i = 0; i < kernelCount - 1; i++) { + address += kernelEventCompletionData[i].getPacketsUsed() * + NEO::TimestampPackets::getSinglePacketSize(); } return address; } diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index 1c9781484d..422f6e7fb8 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -640,15 +640,15 @@ TEST_F(TimestampEventCreate, givenEventCreatedWithTimestampThenIsTimestampEventF } TEST_F(TimestampEventCreate, givenEventTimestampsCreatedWhenResetIsInvokeThenCorrectDataAreSet) { - EXPECT_NE(nullptr, event->kernelTimestampsData); + EXPECT_NE(nullptr, event->kernelEventCompletionData); for (auto j = 0u; j < EventPacketsCount::maxKernelSplit; j++) { for (auto i = 0u; i < NEO::TimestampPacketSizeControl::preferredPacketCount; i++) { - EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelTimestampsData[j].getContextStartValue(i)); - EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelTimestampsData[j].getGlobalStartValue(i)); - EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelTimestampsData[j].getContextEndValue(i)); - EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelTimestampsData[j].getGlobalEndValue(i)); + EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelEventCompletionData[j].getContextStartValue(i)); + EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelEventCompletionData[j].getGlobalStartValue(i)); + EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelEventCompletionData[j].getContextEndValue(i)); + EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelEventCompletionData[j].getGlobalEndValue(i)); } - EXPECT_EQ(1u, event->kernelTimestampsData[j].getPacketsUsed()); + EXPECT_EQ(1u, event->kernelEventCompletionData[j].getPacketsUsed()); } EXPECT_EQ(1u, event->kernelCount); @@ -692,7 +692,7 @@ TEST_F(TimestampEventCreate, givenEventTimestampWhenPacketCountIsSetThenCorrectO } TEST_F(TimestampEventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrectDataAreSet) { - EXPECT_NE(nullptr, event->kernelTimestampsData); + EXPECT_NE(nullptr, event->kernelEventCompletionData); event->hostSignal(); ze_result_t result = event->queryStatus(); EXPECT_EQ(ZE_RESULT_SUCCESS, result); @@ -702,12 +702,12 @@ TEST_F(TimestampEventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrec EXPECT_EQ(ZE_RESULT_NOT_READY, result); for (auto j = 0u; j < EventPacketsCount::maxKernelSplit; j++) { for (auto i = 0u; i < NEO::TimestampPacketSizeControl::preferredPacketCount; i++) { - EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelTimestampsData[j].getContextStartValue(i)); - EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelTimestampsData[j].getGlobalStartValue(i)); - EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelTimestampsData[j].getContextEndValue(i)); - EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelTimestampsData[j].getGlobalEndValue(i)); + EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelEventCompletionData[j].getContextStartValue(i)); + EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelEventCompletionData[j].getGlobalStartValue(i)); + EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelEventCompletionData[j].getContextEndValue(i)); + EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelEventCompletionData[j].getGlobalEndValue(i)); } - EXPECT_EQ(1u, event->kernelTimestampsData[j].getPacketsUsed()); + EXPECT_EQ(1u, event->kernelEventCompletionData[j].getPacketsUsed()); } EXPECT_EQ(1u, event->kernelCount); } @@ -799,7 +799,7 @@ TEST_F(EventQueryTimestampExpWithSubDevice, givenEventWhenQuerytimestampExpWithS uint32_t numPackets = 2; for (uint32_t packetId = 0; packetId < numPackets; packetId++) { - event->kernelTimestampsData[0].assignDataToAllTimestamps(packetId, event->hostAddress); + event->kernelEventCompletionData[0].assignDataToAllTimestamps(packetId, event->hostAddress); event->hostAddress = ptrOffset(event->hostAddress, NEO::TimestampPackets::getSinglePacketSize()); } uint32_t pCount = 0; @@ -865,7 +865,7 @@ TEST_F(TimestampEventCreate, givenEventWhenQueryingTimestampExpThenCorrectDataSe uint32_t pCount = 2; for (uint32_t packetId = 0; packetId < pCount; packetId++) { - event->kernelTimestampsData[0].assignDataToAllTimestamps(packetId, event->hostAddress); + event->kernelEventCompletionData[0].assignDataToAllTimestamps(packetId, event->hostAddress); event->hostAddress = ptrOffset(event->hostAddress, NEO::TimestampPackets::getSinglePacketSize()); } diff --git a/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp b/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp index a026a89836..6eafe35039 100644 --- a/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp @@ -143,6 +143,146 @@ HWTEST2_F(CommandListAppendLaunchKernelWithAtomics, givenKernelWithGlobalAtomics EXPECT_FALSE(pCommandList->commandContainer.lastSentUseGlobalAtomics); } +using CommandListAppendLaunchKernelL3Flush = Test; +HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenKernelWithRegularEventAndWithWalkerPartitionThenProperCommandsEncoded, IsXeHpCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + DebugManagerStateRestore restorer; + DebugManager.flags.EnableWalkerPartition.set(1); + Mock<::L0::Kernel> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + kernel.setGroupSize(1, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + auto pCommandList = std::make_unique>>(); + auto result = pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST; + + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc)); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, event->toHandle(), false, false, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(pCommandList->commandContainer.getCommandStream()->getCpuBase(), 0), pCommandList->commandContainer.getCommandStream()->getUsed())); + + EXPECT_LT(1u, pCommandList->partitionCount); + auto itorLri = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorLri); + auto itorPC = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, itorPC.size()); + uint32_t postSyncCount = 0u; + for (auto it : itorPC) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncCount++; + } + } + ASSERT_LE(1u, postSyncCount); +} + +HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenKernelWithTimestampEventAndWithWalkerPartitionThenProperCommandsEncoded, IsXeHpCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + DebugManagerStateRestore restorer; + DebugManager.flags.EnableWalkerPartition.set(1); + Mock<::L0::Kernel> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + kernel.setGroupSize(1, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + auto pCommandList = std::make_unique>>(); + auto result = pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST; + + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc)); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, event->toHandle(), false, false, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(pCommandList->commandContainer.getCommandStream()->getCpuBase(), 0), pCommandList->commandContainer.getCommandStream()->getUsed())); + + EXPECT_LT(1u, pCommandList->partitionCount); + auto itorLri = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorLri); + auto itorPC = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, itorPC.size()); + uint32_t postSyncCount = 0u; + for (auto it : itorPC) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncCount++; + } + } + ASSERT_LE(1u, postSyncCount); +} + +HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenKernelWithEventAndWithoutWalkerPartitionThenProperCommandsEncoded, IsXeHpCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + DebugManagerStateRestore restorer; + DebugManager.flags.EnableWalkerPartition.set(0); + Mock<::L0::Kernel> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + kernel.setGroupSize(1, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + auto pCommandList = std::make_unique>>(); + auto result = pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc)); + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, event->toHandle(), false, false, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(pCommandList->commandContainer.getCommandStream()->getCpuBase(), 0), pCommandList->commandContainer.getCommandStream()->getUsed())); + + EXPECT_EQ(1u, pCommandList->partitionCount); + auto itorLri = find(cmdList.begin(), cmdList.end()); + ASSERT_EQ(cmdList.end(), itorLri); +} + HWTEST2_F(CommandListCreate, WhenCreatingCommandListThenBindingTablePoolAllocAddedToBatchBuffer, IsXeHpCore) { using _3DSTATE_BINDING_TABLE_POOL_ALLOC = typename FamilyType::_3DSTATE_BINDING_TABLE_POOL_ALLOC;