diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 162a24b82a..a01ff29938 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -167,7 +167,7 @@ struct CommandList : _ze_command_list_handle_t { virtual ze_result_t appendMINoop() = 0; virtual ze_result_t appendPipeControl(void *dstPtr, uint64_t value) = 0; virtual ze_result_t appendWaitOnMemory(void *desc, void *ptr, - uint32_t data, ze_event_handle_t hSignalEvent) = 0; + uint32_t data, ze_event_handle_t signalEventHandle) = 0; virtual ze_result_t appendWriteToMemory(void *desc, void *ptr, uint64_t data) = 0; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index aaca63a0ec..65c82dcf4e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -156,7 +156,7 @@ struct CommandListCoreFamily : CommandListImp { ze_result_t appendMINoop() override; ze_result_t appendPipeControl(void *dstPtr, uint64_t value) override; ze_result_t appendWaitOnMemory(void *desc, void *ptr, - uint32_t data, ze_event_handle_t hSignalEvent) override; + uint32_t data, ze_event_handle_t signalEventHandle) override; ze_result_t appendWriteToMemory(void *desc, void *ptr, uint64_t data) override; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index a0c95d93f5..5fa5e94733 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2589,7 +2589,7 @@ template ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, void *ptr, uint32_t data, - ze_event_handle_t hSignalEvent) { + ze_event_handle_t signalEventHandle) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; auto descriptor = reinterpret_cast(desc); @@ -2617,11 +2617,19 @@ ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, return ZE_RESULT_ERROR_INVALID_ARGUMENT; } + Event *signalEvent = nullptr; + if (signalEventHandle) { + signalEvent = Event::fromHandle(signalEventHandle); + } + auto srcAllocationStruct = getAlignedAllocation(this->device, ptr, sizeof(uint32_t), true); if (srcAllocationStruct.alloc == nullptr) { return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; } UNRECOVERABLE_IF(srcAllocationStruct.alloc == nullptr); + + appendEventForProfiling(signalEvent, true); + commandContainer.addToResidencyContainer(srcAllocationStruct.alloc); uint64_t gpuAddress = static_cast(srcAllocationStruct.alignedAllocationPtr); NEO::EncodeSempahore::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), @@ -2638,27 +2646,8 @@ ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, NEO::MemorySynchronizationCommands::addAdditionalSynchronization(*commandContainer.getCommandStream(), gpuAddress, true, hwInfo); } - if (hSignalEvent) { - auto event = Event::fromHandle(hSignalEvent); + appendSignalEventPostWalker(signalEvent); - commandContainer.addToResidencyContainer(&event->getAllocation(this->device)); - uint64_t eventGpuAddr = event->getCompletionFieldGpuAddress(this->device); - - if (isCopyOnly()) { - NEO::MiFlushArgs args; - args.commandWithPostSync = true; - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), eventGpuAddr, - Event::STATE_SIGNALED, args, hwInfo); - } else { - NEO::PipeControlArgs args; - args.dcFlushEnable = getDcFlushRequired(!!event->signalScope); - NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( - *commandContainer.getCommandStream(), NEO::PostSyncMode::ImmediateData, - eventGpuAddr, Event::STATE_SIGNALED, - hwInfo, - args); - } - } return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index 8e6a31400b..455706e765 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -166,10 +166,10 @@ struct Event : _ze_event_handle_t { return maxKernelCount; } - uint64_t globalStartTS; - uint64_t globalEndTS; - uint64_t contextStartTS; - uint64_t contextEndTS; + uint64_t globalStartTS = 1; + uint64_t globalEndTS = 1; + uint64_t contextStartTS = 1; + uint64_t contextEndTS = 1; std::chrono::microseconds gpuHangCheckPeriod{500'000}; // Metric streamer instance associated with the event. @@ -205,10 +205,11 @@ struct Event : _ze_event_handle_t { uint32_t maxPacketCount = 0; uint32_t totalEventSize = 0; + std::atomic isCompleted{STATE_INITIAL}; + bool isTimestampEvent = false; bool usingContextEndOffset = false; bool signalAllEventPackets = false; - std::atomic isCompleted{STATE_INITIAL}; }; template diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 0346646d3c..4eb3833d19 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -405,7 +405,7 @@ struct MockCommandList : public CommandList { uint64_t value)); ADDMETHOD_NOBASE(appendWaitOnMemory, ze_result_t, ZE_RESULT_SUCCESS, (void *desc, void *ptr, - uint32_t data, ze_event_handle_t hSignalEvent)); + uint32_t data, ze_event_handle_t signalEventHandle)); ADDMETHOD_NOBASE(appendWriteToMemory, ze_result_t, ZE_RESULT_SUCCESS, (void *desc, void *ptr, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp index 2c526f0f8a..f7278df488 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp @@ -12,6 +12,7 @@ #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/api/driver_experimental/public/zex_api.h" +#include "level_zero/core/source/hw_helpers/l0_hw_helper.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" @@ -47,6 +48,8 @@ class CommandListWaitOnMemFixture : public DeviceFixture { size, alignment, &ptr); EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_NE(nullptr, ptr); + + signalAllPackets = L0GfxCoreHelper::useSignalAllEventPackets(device->getHwInfo()); } void tearDown() { @@ -62,8 +65,9 @@ class CommandListWaitOnMemFixture : public DeviceFixture { std::unique_ptr commandListBcs; std::unique_ptr eventPool; std::unique_ptr event; - uint32_t waitMemData = 1u; void *ptr = nullptr; + uint32_t waitMemData = 1u; + bool signalAllPackets = false; }; template @@ -393,6 +397,7 @@ HWTEST_F(CommandListAppendWaitOnMem, givenAppendWaitOnMemWithSignalEventAndHostS using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; ze_result_t result = ZE_RESULT_SUCCESS; auto &commandContainer = commandList->commandContainer; @@ -421,19 +426,40 @@ HWTEST_F(CommandListAppendWaitOnMem, givenAppendWaitOnMemWithSignalEventAndHostS ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); + auto gpuAddress = event->getCompletionFieldGpuAddress(this->device); + + size_t expectedPostSyncStoreDataImm = 0; + uint64_t storeDataImmAddress = gpuAddress; + if (signalAllPackets) { + expectedPostSyncStoreDataImm = event->getMaxPacketsCount() - 1; + } + + auto itorStoreDataImm = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(expectedPostSyncStoreDataImm, itorStoreDataImm.size()); + + for (size_t i = 0; i < expectedPostSyncStoreDataImm; i++) { + auto cmd = genCmdCast(*itorStoreDataImm[i]); + EXPECT_EQ(storeDataImmAddress, cmd->getAddress()); + EXPECT_FALSE(cmd->getStoreQword()); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getDataDword0()); + storeDataImmAddress += event->getSinglePacketSize(); + } + auto itor = find(cmdList.begin(), cmdList.end()); EXPECT_NE(cmdList.end(), itor); + itor++; auto itorPC = findAll(itor, cmdList.end()); ASSERT_NE(0u, itorPC.size()); + + auto pipeControlAddress = storeDataImmAddress; bool postSyncFound = false; for (auto it : itorPC) { auto cmd = genCmdCast(*it); if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); EXPECT_EQ(cmd->getImmediateData(), Event::STATE_SIGNALED); - auto gpuAddress = event->getCompletionFieldGpuAddress(this->device); - EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(pipeControlAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); EXPECT_EQ(NEO::MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo), cmd->getDcFlushEnable()); postSyncFound = true; } @@ -445,6 +471,7 @@ HWTEST_F(CommandListAppendWaitOnMem, givenAppendWaitOnMemWithSignalEventAndNoSco using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; ze_result_t result = ZE_RESULT_SUCCESS; auto &commandContainer = commandList->commandContainer; @@ -471,19 +498,40 @@ HWTEST_F(CommandListAppendWaitOnMem, givenAppendWaitOnMemWithSignalEventAndNoSco ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); + auto gpuAddress = event->getCompletionFieldGpuAddress(this->device); + + size_t expectedPostSyncStoreDataImm = 0; + uint64_t storeDataImmAddress = gpuAddress; + if (signalAllPackets) { + expectedPostSyncStoreDataImm = event->getMaxPacketsCount() - 1; + } + + auto itorStoreDataImm = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(expectedPostSyncStoreDataImm, itorStoreDataImm.size()); + + for (size_t i = 0; i < expectedPostSyncStoreDataImm; i++) { + auto cmd = genCmdCast(*itorStoreDataImm[i]); + EXPECT_EQ(storeDataImmAddress, cmd->getAddress()); + EXPECT_FALSE(cmd->getStoreQword()); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getDataDword0()); + storeDataImmAddress += event->getSinglePacketSize(); + } + auto itor = find(cmdList.begin(), cmdList.end()); EXPECT_NE(cmdList.end(), itor); + itor++; auto itorPC = findAll(itor, cmdList.end()); ASSERT_NE(0u, itorPC.size()); + + auto pipeControlAddress = storeDataImmAddress; bool postSyncFound = false; for (auto it : itorPC) { auto cmd = genCmdCast(*it); if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); EXPECT_EQ(cmd->getImmediateData(), Event::STATE_SIGNALED); - auto gpuAddress = event->getCompletionFieldGpuAddress(this->device); - EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(pipeControlAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); EXPECT_FALSE(cmd->getDcFlushEnable()); postSyncFound = true; }