diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index a2254b35b4..32f6ed4a45 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1242,7 +1242,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu gpuAddr = event->getGpuAddress(); if (event->isTimestampEvent) { - gpuAddr += event->getOffsetOfEventTimestampRegister(Event::CONTEXT_END); + gpuAddr += offsetof(KernelTimestampEvent, contextEnd); } NEO::HardwareCommandsHelper::programMiSemaphoreWait(*(commandContainer.getCommandStream()), diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index d2948942b7..47e2909edb 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -87,37 +87,34 @@ void CommandListCoreFamily::appendEventForProfiling(ze_event_hand return; } - uint64_t timeStampAddress = 0; - using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; commandContainer.addToResidencyContainer(&event->getAllocation()); + auto baseAddr = event->getGpuAddress(); + if (beforeWalker) { - timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::GLOBAL_START); - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, timeStampAddress); + auto contextStartAddr = baseAddr; + auto globalStartAddr = baseAddr + offsetof(KernelTimestampEvent, globalStart); - timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::CONTEXT_START); - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timeStampAddress); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, globalStartAddr); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddr); } else { - - timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::GLOBAL_END); - NEO::PipeControlArgs args; - args.dcFlushEnable = (event->signalScope == ZE_EVENT_SCOPE_FLAG_NONE) ? false : true; + auto contextEndAddr = baseAddr + offsetof(KernelTimestampEvent, contextEnd); + auto globalEndAddr = baseAddr + offsetof(KernelTimestampEvent, globalEnd); if (isCopyOnlyCmdList) { - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), timeStampAddress, 0llu, true, true); + NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), globalEndAddr, 0llu, true, true); } else { - NEO::MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( - *(commandContainer.getCommandStream()), POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP, - timeStampAddress, - 0llu, - device->getHwInfo(), - args); + NEO::PipeControlArgs args; + args.dcFlushEnable = false; - timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::CONTEXT_END); - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timeStampAddress); + NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, globalEndAddr); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddr); + + args.dcFlushEnable = (event->signalScope == ZE_EVENT_SCOPE_FLAG_NONE) ? false : true; if (args.dcFlushEnable) { NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); } diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp index b6485d51e2..151a5721f5 100644 --- a/level_zero/core/source/event/event.cpp +++ b/level_zero/core/source/event/event.cpp @@ -51,7 +51,7 @@ struct EventImp : public Event { if (isTimestampEvent) { auto baseAddr = reinterpret_cast(hostAddress); - auto timeStampAddress = baseAddr + getOffsetOfEventTimestampRegister(Event::CONTEXT_END); + auto timeStampAddress = baseAddr + offsetof(KernelTimestampEvent, contextEnd); hostAddr = reinterpret_cast(timeStampAddress); } @@ -80,10 +80,8 @@ struct EventPoolImp : public EventPool { pool[i] = EventPool::EVENT_STATE_INITIAL; } - auto timestampMultiplier = 1; if (flags & ZE_EVENT_POOL_FLAG_TIMESTAMP) { isEventPoolUsedForTimestamp = true; - timestampMultiplier = numEventTimestampsToRead; } ze_device_handle_t hDevice; @@ -98,7 +96,7 @@ struct EventPoolImp : public EventPool { device = Device::fromHandle(hDevice); NEO::AllocationProperties properties( - device->getRootDeviceIndex(), count * eventSize * timestampMultiplier, + device->getRootDeviceIndex(), count * eventSize, NEO::GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY); properties.alignment = MemoryConstants::cacheLineSize; eventPoolAllocation = driver->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties); @@ -142,8 +140,6 @@ struct EventPoolImp : public EventPool { uint32_t getEventSize() override { return eventSize; } - uint32_t getNumEventTimestampsToRead() override { return numEventTimestampsToRead; } - ze_result_t destroyPool() { if (eventPoolUsedCount != 0) { return ZE_RESULT_ERROR_INVALID_ARGUMENT; @@ -164,9 +160,8 @@ struct EventPoolImp : public EventPool { std::queue lastEventPoolOffsetUsed; protected: - const uint32_t eventSize = 16u; + const uint32_t eventSize = sizeof(struct KernelTimestampEvent); const uint32_t eventAlignment = MemoryConstants::cacheLineSize; - const int32_t numEventTimestampsToRead = 4u; }; Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *device) { @@ -220,17 +215,22 @@ void EventImp::makeAllocationResident() { } ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) { - for (uint32_t i = 0; i < this->eventPool->getNumEventTimestampsToRead(); i++) { - auto baseAddr = reinterpret_cast(hostAddress); - auto timeStampAddress = baseAddr + getOffsetOfEventTimestampRegister(i); - auto tsptr = reinterpret_cast(timeStampAddress); - *(tsptr) = eventVal; + auto baseAddr = reinterpret_cast(hostAddress); + auto signalScopeFlag = this->signalScope; - if (this->signalScope != ZE_EVENT_SCOPE_FLAG_NONE) { + auto eventTsSetFunc = [&](auto tsAddr) { + auto tsptr = reinterpret_cast(tsAddr); + memcpy_s(tsptr, sizeof(uint32_t), static_cast(&eventVal), sizeof(uint32_t)); + if (signalScopeFlag != ZE_EVENT_SCOPE_FLAG_NONE) { NEO::CpuIntrinsics::clFlush(tsptr); } - } + }; + + eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextStart)); + eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart)); + eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextEnd)); + eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd)); makeAllocationResident(); @@ -239,7 +239,7 @@ ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) { ze_result_t EventImp::hostEventSetValue(uint32_t eventVal) { if (isTimestampEvent) { - hostEventSetValueTimestamps(eventVal); + return hostEventSetValueTimestamps(eventVal); } auto hostAddr = static_cast(hostAddress); @@ -303,7 +303,7 @@ ze_result_t EventImp::reset() { ze_result_t EventImp::getTimestamp(ze_event_timestamp_type_t timestampType, void *dstptr) { auto baseAddr = reinterpret_cast(hostAddress); - uint64_t *tsptr = nullptr; + uint64_t tsAddr = 0u; constexpr uint64_t tsMask = (1ull << 32) - 1; uint64_t tsData = Event::STATE_INITIAL & tsMask; @@ -316,17 +316,19 @@ ze_result_t EventImp::getTimestamp(ze_event_timestamp_type_t timestampType, void return ZE_RESULT_SUCCESS; } - if (timestampType == ZE_EVENT_TIMESTAMP_GLOBAL_START) { - tsptr = reinterpret_cast(baseAddr + getOffsetOfEventTimestampRegister(Event::GLOBAL_START)); - } else if (timestampType == ZE_EVENT_TIMESTAMP_GLOBAL_END) { - tsptr = reinterpret_cast(baseAddr + getOffsetOfEventTimestampRegister(Event::GLOBAL_END)); - } else if (timestampType == ZE_EVENT_TIMESTAMP_CONTEXT_START) { - tsptr = reinterpret_cast(baseAddr + getOffsetOfEventTimestampRegister(Event::CONTEXT_START)); + if (timestampType == ZE_EVENT_TIMESTAMP_CONTEXT_START) { + tsAddr = baseAddr + offsetof(KernelTimestampEvent, contextStart); + } else if (timestampType == ZE_EVENT_TIMESTAMP_GLOBAL_START) { + tsAddr = baseAddr + offsetof(KernelTimestampEvent, globalStart); + } else if (timestampType == ZE_EVENT_TIMESTAMP_CONTEXT_END) { + tsAddr = baseAddr + offsetof(KernelTimestampEvent, contextEnd); } else { - tsptr = reinterpret_cast(baseAddr + getOffsetOfEventTimestampRegister(Event::CONTEXT_END)); + tsAddr = baseAddr + offsetof(KernelTimestampEvent, globalEnd); } - tsData = (*tsptr & tsMask); + memcpy_s(static_cast(&tsData), sizeof(uint32_t), reinterpret_cast(tsAddr), sizeof(uint32_t)); + + tsData &= tsMask; memcpy_s(dstptr, sizeof(uint64_t), static_cast(&tsData), sizeof(uint64_t)); return ZE_RESULT_SUCCESS; @@ -353,14 +355,9 @@ ze_result_t EventPoolImp::reserveEventFromPool(int index, Event *event) { lastEventPoolOffsetUsed.pop(); } - auto timestampMultiplier = 1; - if (static_cast(this)->isEventPoolUsedForTimestamp) { - timestampMultiplier = numEventTimestampsToRead; - } - uint64_t baseHostAddr = reinterpret_cast(eventPoolAllocation->getUnderlyingBuffer()); - event->hostAddress = reinterpret_cast(baseHostAddr + (event->offsetUsed * eventSize * timestampMultiplier)); - event->gpuAddress = eventPoolAllocation->getGpuAddress() + (event->offsetUsed * eventSize * timestampMultiplier); + event->hostAddress = reinterpret_cast(baseHostAddr + (event->offsetUsed * eventSize)); + event->gpuAddress = eventPoolAllocation->getGpuAddress() + (event->offsetUsed * eventSize); eventPoolUsedCount++; diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index ca6b10d9e5..9d82870aa1 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -36,13 +36,6 @@ struct Event : _ze_event_handle_t { STATE_INITIAL = STATE_CLEARED }; - enum EventTimestampRegister : uint32_t { - GLOBAL_START = 0u, - GLOBAL_END, - CONTEXT_START, - CONTEXT_END - }; - static Event *create(EventPool *eventPool, const ze_event_desc_t *desc, Device *device); static Event *fromHandle(ze_event_handle_t handle) { return static_cast(handle); } @@ -72,6 +65,13 @@ struct Event : _ze_event_handle_t { NEO::GraphicsAllocation *allocation = nullptr; }; +struct KernelTimestampEvent { + uint32_t contextStart = Event::STATE_INITIAL; + uint32_t globalStart = Event::STATE_INITIAL; + uint32_t contextEnd = Event::STATE_INITIAL; + uint32_t globalEnd = Event::STATE_INITIAL; +}; + struct EventPool : _ze_event_pool_handle_t { static EventPool *create(DriverHandle *driver, uint32_t numDevices, ze_device_handle_t *phDevices, const ze_event_pool_desc_t *desc); virtual ~EventPool() = default; @@ -100,7 +100,6 @@ struct EventPool : _ze_event_pool_handle_t { virtual NEO::GraphicsAllocation &getAllocation() { return *eventPoolAllocation; } virtual uint32_t getEventSize() = 0; - virtual uint32_t getNumEventTimestampsToRead() = 0; bool isEventPoolUsedForTimestamp = false; diff --git a/level_zero/core/test/unit_tests/mocks/mock_event.h b/level_zero/core/test/unit_tests/mocks/mock_event.h index a8fa8f752f..6d6b859ba7 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_event.h +++ b/level_zero/core/test/unit_tests/mocks/mock_event.h @@ -71,7 +71,6 @@ struct Mock : public EventPool { MOCK_METHOD1(releaseEventToPool, ze_result_t(::L0::Event *event)); MOCK_METHOD0(getDevice, Device *()); MOCK_METHOD0(getEventSize, uint32_t()); - MOCK_METHOD0(getNumEventTimestampsToRead, uint32_t()); std::vector pool; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp index 4d0e2ae475..d23f83932f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp @@ -289,22 +289,22 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel ASSERT_NE(cmdList.end(), itor); itor++; - auto itorPC = findAll(cmdList.begin(), cmdList.end()); - EXPECT_NE(0u, itorPC.size()); - bool postSyncFound = false; - for (auto it : itorPC) { - auto cmd = genCmdCast(*it); - if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP) { - EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); - EXPECT_FALSE(cmd->getDcFlushEnable()); - auto gpuAddress = event->getGpuAddress() + - event->getOffsetOfEventTimestampRegister(Event::GLOBAL_END); - EXPECT_EQ(cmd->getAddressHigh(), gpuAddress >> 32u); - EXPECT_EQ(cmd->getAddress(), uint32_t(gpuAddress)); - postSyncFound = true; - } + itor = find(itor, cmdList.end()); + ASSERT_NE(cmdList.end(), itor); + { + auto cmd = genCmdCast(*itor); + EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); + EXPECT_FALSE(cmd->getDcFlushEnable()); } - EXPECT_TRUE(postSyncFound); + itor++; + + itor = find(itor, cmdList.end()); + ASSERT_NE(cmdList.end(), itor); + { + auto cmd = genCmdCast(*itor); + EXPECT_EQ(REG_GLOBAL_TIMESTAMP_LDW, cmd->getRegisterAddress()); + } + itor++; itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); @@ -321,6 +321,49 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel } } +HWTEST2_F(CommandListAppendLaunchKernel, givenKernelLaunchWithTSEventAndScopeFlagHostThenPCWithDCFlushEncoded, TimestampEventSupport) { + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + + Mock<::L0::Kernel> kernel; + std::unique_ptr commandList(L0::CommandList::create(productFamily, device, false)); + auto usedSpaceBefore = commandList->commandContainer.getCommandStream()->getUsed(); + ze_event_pool_desc_t eventPoolDesc = { + ZE_EVENT_POOL_DESC_VERSION_CURRENT, + ZE_EVENT_POOL_FLAG_TIMESTAMP, + 1}; + + ze_event_desc_t eventDesc = { + ZE_EVENT_DESC_VERSION_CURRENT, + 0, + ZE_EVENT_SCOPE_FLAG_HOST, + ZE_EVENT_SCOPE_FLAG_HOST}; + + auto eventPool = std::unique_ptr(EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc)); + auto event = std::unique_ptr(Event::create(eventPool.get(), &eventDesc, device)); + + ze_group_count_t groupCount{1, 1, 1}; + auto result = commandList->appendLaunchKernel( + kernel.toHandle(), &groupCount, event->toHandle(), 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto usedSpaceAfter = commandList->commandContainer.getCommandStream()->getUsed(); + EXPECT_GT(usedSpaceAfter, usedSpaceBefore); + + GenCmdList cmdList; + EXPECT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), usedSpaceAfter)); + + auto itorPC = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, itorPC.size()); + + PIPE_CONTROL *cmd = genCmdCast(*itorPC[itorPC.size() - 1]); + EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); + EXPECT_TRUE(cmd->getDcFlushEnable()); +} + HWTEST2_F(CommandListAppendLaunchKernel, givenImmediateCommandListWhenAppendingLaunchKernelThenKernelIsExecutedOnImmediateCmdQ, SklPlusMatcher) { createKernel(); diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index 43c3cf5d39..801886874d 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -33,17 +33,17 @@ TEST_F(EventPoolCreate, allocationContainsAtLeast16Bytes) { EXPECT_GE(allocation->getUnderlyingBufferSize(), minAllocationSize); } -TEST_F(EventPoolCreate, givenTimestampEventsThenVerifyNumTimestampsToRead) { +TEST_F(EventPoolCreate, givenTimestampEventsThenEventSizeSufficientForAllKernelTimestamps) { ze_event_pool_desc_t eventPoolDesc = { ZE_EVENT_POOL_DESC_VERSION_CURRENT, - ZE_EVENT_POOL_FLAG_TIMESTAMP, // all events in pool are visible to Host + ZE_EVENT_POOL_FLAG_TIMESTAMP, 1}; std::unique_ptr eventPool(EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc)); ASSERT_NE(nullptr, eventPool); - uint32_t numTimestamps = 4u; - EXPECT_EQ(numTimestamps, eventPool->getNumEventTimestampsToRead()); + uint32_t kernelTimestampsSize = sizeof(struct KernelTimestampEvent); + EXPECT_EQ(kernelTimestampsSize, eventPool->getEventSize()); } TEST_F(EventPoolCreate, givenAnEventIsCreatedFromThisEventPoolThenEventContainsDeviceCommandStreamReceiver) { @@ -165,8 +165,7 @@ TEST_F(TimestampEventCreate, givenSingleTimestampEventThenAllocationSizeCreatedF auto allocation = &eventPool->getAllocation(); ASSERT_NE(nullptr, allocation); - uint32_t minTimestampEventAllocation = eventPool->getEventSize() * - eventPool->getNumEventTimestampsToRead(); + uint32_t minTimestampEventAllocation = eventPool->getEventSize(); EXPECT_GE(minTimestampEventAllocation, allocation->getUnderlyingBufferSize()); }