From 27d042107aeb127ffe030dba0e741b2065d08a90 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Thu, 27 Oct 2022 11:40:44 +0000 Subject: [PATCH] Add feature to estimate number of event packets Related-To: NEO-7469 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/CMakeLists.txt | 1 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 2 +- .../core/source/context/context_imp.cpp | 11 +- level_zero/core/source/context/context_imp.h | 9 + level_zero/core/source/device/device.h | 1 + level_zero/core/source/device/device_imp.cpp | 10 + level_zero/core/source/device/device_imp.h | 1 + level_zero/core/source/driver/driver_handle.h | 1 + .../core/source/driver/driver_handle_imp.cpp | 20 +- .../core/source/driver/driver_handle_imp.h | 1 + level_zero/core/source/event/event.cpp | 34 ++- level_zero/core/source/event/event.h | 20 +- level_zero/core/source/event/event_impl.inl | 16 +- .../core/source/hw_helpers/l0_hw_helper.cpp | 7 + .../core/source/hw_helpers/l0_hw_helper.h | 7 + .../hw_helpers/l0_hw_helper_pvc_and_later.inl | 20 -- .../hw_helpers/l0_hw_helper_skl_and_later.inl | 10 + .../l0_hw_helper_xehp_and_later.inl | 58 +++++ .../xe_hp_core/l0_hw_helper_xe_hp_core.cpp | 6 +- .../xe_hpc_core/l0_hw_helper_xe_hpc_core.cpp | 1 + .../xe_hpg_core/l0_hw_helper_xe_hpg_core.cpp | 6 +- .../core/test/unit_tests/mocks/mock_device.h | 1 + .../unit_tests/mocks/mock_driver_handle.h | 1 + .../core/test/unit_tests/mocks/mock_event.h | 2 + .../test_cmdlist_append_event_reset.cpp | 42 +++- .../test_cmdlist_append_wait_on_events.cpp | 2 + .../unit_tests/sources/event/test_event.cpp | 219 ++++++++++++++++-- .../sources/helper/l0_hw_helper_tests.cpp | 80 +++++++ .../xe_hp_core/test_cmdlist_xe_hp_core.cpp | 27 ++- .../xe_hpg_core/test_cmdlist_xe_hpg_core.cpp | 3 + .../debug_settings/debug_variables_base.inl | 1 + shared/test/common/test_files/igdrcl.config | 1 + 32 files changed, 545 insertions(+), 76 deletions(-) create mode 100644 level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl diff --git a/level_zero/core/source/CMakeLists.txt b/level_zero/core/source/CMakeLists.txt index fe00642916..c0722db9b1 100644 --- a/level_zero/core/source/CMakeLists.txt +++ b/level_zero/core/source/CMakeLists.txt @@ -99,6 +99,7 @@ if(SUPPORT_XEHP_AND_LATER) list(APPEND L0_RUNTIME_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cmdlist/cmdlist_hw_xehp_and_later.inl ${CMAKE_CURRENT_SOURCE_DIR}/cmdqueue/cmdqueue_xe_hp_core_and_later.inl + ${CMAKE_CURRENT_SOURCE_DIR}/hw_helpers/l0_hw_helper_xehp_and_later.inl ) endif() diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 8fa18e582b..49d05854ae 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -390,7 +390,7 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand } if (event->isEventTimestampFlagSet()) { - packetsToReset = EventPacketsCount::eventPackets; + packetsToReset = event->getMaxPacketsCount(); } event->resetPackets(); event->resetCompletion(); diff --git a/level_zero/core/source/context/context_imp.cpp b/level_zero/core/source/context/context_imp.cpp index 910ca972aa..46291a3ec3 100644 --- a/level_zero/core/source/context/context_imp.cpp +++ b/level_zero/core/source/context/context_imp.cpp @@ -577,12 +577,11 @@ ze_result_t ContextImp::openEventPoolIpcHandle(const ze_ipc_event_pool_handle_t auto device = Device::fromHandle(this->devices.begin()->second); auto neoDevice = device->getNEODevice(); NEO::osHandle osHandle = static_cast(handle); - auto &hwHelper = device->getHwHelper(); - const uint32_t eventAlignment = static_cast(hwHelper.getTimestampPacketAllocatorAlignment()); - uint32_t eventSize = static_cast(alignUp(EventPacketsCount::eventPackets * hwHelper.getSingleTimestampPacketSize(), eventAlignment)); - size_t alignedSize = alignUp(numEvents * eventSize, MemoryConstants::pageSize64k); + + eventPool->initializeSizeParameters(this->numDevices, this->deviceHandles.data(), *this->driverHandle, device->getHwInfo()); + NEO::AllocationProperties unifiedMemoryProperties{rootDeviceIndex, - alignedSize, + eventPool->getEventPoolSize(), NEO::AllocationType::BUFFER_HOST_MEMORY, systemMemoryBitfield}; @@ -605,8 +604,6 @@ ze_result_t ContextImp::openEventPoolIpcHandle(const ze_ipc_event_pool_handle_t eventPool->eventPoolPtr = reinterpret_cast(alloc->getUnderlyingBuffer()); eventPool->devices.push_back(device); eventPool->isImportedIpcPool = true; - eventPool->setEventSize(eventSize); - eventPool->setEventAlignment(eventAlignment); for (auto currDeviceIndex : this->rootDeviceIndices) { if (currDeviceIndex == rootDeviceIndex) { diff --git a/level_zero/core/source/context/context_imp.h b/level_zero/core/source/context/context_imp.h index d6ca7d581a..63a9a6548e 100644 --- a/level_zero/core/source/context/context_imp.h +++ b/level_zero/core/source/context/context_imp.h @@ -149,11 +149,20 @@ struct ContextImp : Context { bool isShareableMemory(const void *exportDesc, bool exportableMemory, NEO::Device *neoDevice) override; void *getMemHandlePtr(ze_device_handle_t hDevice, uint64_t handle, ze_ipc_memory_flags_t flags) override; + void initDeviceHandles(uint32_t numDevices, ze_device_handle_t *deviceHandles) { + this->numDevices = numDevices; + if (numDevices > 0) { + this->deviceHandles.assign(deviceHandles, deviceHandles + numDevices); + } + } + protected: bool isAllocationSuitableForCompression(const StructuresLookupTable &structuresLookupTable, Device &device, size_t allocSize); std::map devices; + std::vector deviceHandles; DriverHandleImp *driverHandle = nullptr; + uint32_t numDevices = 0; }; } // namespace L0 diff --git a/level_zero/core/source/device/device.h b/level_zero/core/source/device/device.h index 16b4ece696..7697976152 100644 --- a/level_zero/core/source/device/device.h +++ b/level_zero/core/source/device/device.h @@ -136,6 +136,7 @@ struct Device : _ze_device_handle_t { virtual NEO::GraphicsAllocation *obtainReusableAllocation(size_t requiredSize, NEO::AllocationType type) = 0; virtual void storeReusableAllocation(NEO::GraphicsAllocation &alloc) = 0; virtual ze_result_t getFabricVertex(ze_fabric_vertex_handle_t *phVertex) = 0; + virtual uint32_t getEventMaxPacketCount() const = 0; protected: NEO::Device *neoDevice = nullptr; diff --git a/level_zero/core/source/device/device_imp.cpp b/level_zero/core/source/device/device_imp.cpp index c12f4712ba..06e57cc7fb 100644 --- a/level_zero/core/source/device/device_imp.cpp +++ b/level_zero/core/source/device/device_imp.cpp @@ -1515,4 +1515,14 @@ ze_result_t DeviceImp::getFabricVertex(ze_fabric_vertex_handle_t *phVertex) { return ZE_RESULT_SUCCESS; } +uint32_t DeviceImp::getEventMaxPacketCount() const { + const auto &hardwareInfo = this->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + + uint32_t basePackets = l0HwHelper.getEventBaseMaxPacketCount(hardwareInfo); + if (this->isImplicitScalingCapable()) { + basePackets *= static_cast(neoDevice->getDeviceBitfield().count()); + } + return basePackets; +} } // namespace L0 diff --git a/level_zero/core/source/device/device_imp.h b/level_zero/core/source/device/device_imp.h index 7bb44bf7a4..0f828aa7f1 100644 --- a/level_zero/core/source/device/device_imp.h +++ b/level_zero/core/source/device/device_imp.h @@ -141,6 +141,7 @@ struct DeviceImp : public Device { ze_result_t queryDeviceLuid(ze_device_luid_ext_properties_t *deviceLuidProperties); ze_result_t setDeviceLuid(ze_device_luid_ext_properties_t *deviceLuidProperties); + uint32_t getEventMaxPacketCount() const override; protected: void adjustCommandQueueDesc(uint32_t &ordinal, uint32_t &index); diff --git a/level_zero/core/source/driver/driver_handle.h b/level_zero/core/source/driver/driver_handle.h index 0c11d71061..606b0dc90f 100644 --- a/level_zero/core/source/driver/driver_handle.h +++ b/level_zero/core/source/driver/driver_handle.h @@ -66,6 +66,7 @@ struct DriverHandle : _ze_driver_handle_t { uint32_t rootDeviceIndex, uintptr_t *gpuAddress) = 0; virtual ze_result_t fabricVertexGetExp(uint32_t *pCount, ze_fabric_vertex_handle_t *phDevices) = 0; + virtual uint32_t getEventMaxPacketCount(uint32_t numDevices, ze_device_handle_t *deviceHandles) const = 0; static DriverHandle *fromHandle(ze_driver_handle_t handle) { return static_cast(handle); } inline ze_driver_handle_t toHandle() { return this; } diff --git a/level_zero/core/source/driver/driver_handle_imp.cpp b/level_zero/core/source/driver/driver_handle_imp.cpp index 381d62cc48..99daa7df70 100644 --- a/level_zero/core/source/driver/driver_handle_imp.cpp +++ b/level_zero/core/source/driver/driver_handle_imp.cpp @@ -58,7 +58,7 @@ ze_result_t DriverHandleImp::createContext(const ze_context_desc_t *desc, } *phContext = context->toHandle(); - + context->initDeviceHandles(numDevices, phDevices); if (numDevices == 0) { for (auto device : this->devices) { auto neoDevice = device->getNEODevice(); @@ -704,4 +704,22 @@ ze_result_t DriverHandleImp::fabricEdgeGetExp(ze_fabric_vertex_handle_t hVertexA return ZE_RESULT_SUCCESS; } +uint32_t DriverHandleImp::getEventMaxPacketCount(uint32_t numDevices, ze_device_handle_t *deviceHandles) const { + uint32_t maxCount = 0; + + if (numDevices == 0) { + for (auto device : this->devices) { + auto deviceMaxCount = device->getEventMaxPacketCount(); + maxCount = std::max(maxCount, deviceMaxCount); + } + } else { + for (uint32_t i = 0; i < numDevices; i++) { + auto deviceMaxCount = Device::fromHandle(deviceHandles[i])->getEventMaxPacketCount(); + maxCount = std::max(maxCount, deviceMaxCount); + } + } + + return maxCount; +} + } // namespace L0 diff --git a/level_zero/core/source/driver/driver_handle_imp.h b/level_zero/core/source/driver/driver_handle_imp.h index b52805af31..175800aec5 100644 --- a/level_zero/core/source/driver/driver_handle_imp.h +++ b/level_zero/core/source/driver/driver_handle_imp.h @@ -82,6 +82,7 @@ struct DriverHandleImp : public DriverHandle { Device *device); ze_result_t fabricEdgeGetExp(ze_fabric_vertex_handle_t hVertexA, ze_fabric_vertex_handle_t hVertexB, uint32_t *pCount, ze_fabric_edge_handle_t *phEdges); + uint32_t getEventMaxPacketCount(uint32_t numDevices, ze_device_handle_t *deviceHandles) const override; std::unique_ptr hostPointerManager; // Experimental functions diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp index 5c7adbb808..9b3cc5183d 100644 --- a/level_zero/core/source/event/event.cpp +++ b/level_zero/core/source/event/event.cpp @@ -43,17 +43,18 @@ ze_result_t EventPoolImp::initialize(DriverHandle *driver, Context *context, uin RootDeviceIndicesContainer rootDeviceIndices; uint32_t maxRootDeviceIndex = 0u; + uint32_t currentNumDevices = numDevices; DriverHandleImp *driverHandleImp = static_cast(driver); bool useDevicesFromApi = true; bool useDeviceAlloc = isEventPoolDeviceAllocationFlagSet(); if (numDevices == 0) { - numDevices = static_cast(driverHandleImp->devices.size()); + currentNumDevices = static_cast(driverHandleImp->devices.size()); useDevicesFromApi = false; } - for (uint32_t i = 0u; i < numDevices; i++) { + for (uint32_t i = 0u; i < currentNumDevices; i++) { Device *eventDevice = nullptr; if (useDevicesFromApi) { @@ -74,14 +75,11 @@ ze_result_t EventPoolImp::initialize(DriverHandle *driver, Context *context, uin } rootDeviceIndices.remove_duplicates(); - auto &hwHelper = devices[0]->getHwHelper(); + auto &hwInfo = getDevice()->getHwInfo(); + useDeviceAlloc |= L0HwHelper::get(hwInfo.platform.eRenderCoreFamily).alwaysAllocateEventInLocalMem(); - useDeviceAlloc |= L0HwHelper::get(getDevice()->getHwInfo().platform.eRenderCoreFamily).alwaysAllocateEventInLocalMem(); + initializeSizeParameters(numDevices, phDevices, *driverHandleImp, hwInfo); - eventAlignment = static_cast(hwHelper.getTimestampPacketAllocatorAlignment()); - eventSize = static_cast(alignUp(EventPacketsCount::eventPackets * hwHelper.getSingleTimestampPacketSize(), eventAlignment)); - - size_t alignedSize = alignUp(numEvents * eventSize, MemoryConstants::pageSize64k); NEO::AllocationType allocationType = isEventPoolTimestampFlagSet() ? NEO::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER : NEO::AllocationType::BUFFER_HOST_MEMORY; if (this->devices.size() > 1) { @@ -97,7 +95,7 @@ ze_result_t EventPoolImp::initialize(DriverHandle *driver, Context *context, uin bool allocatedMemory = false; if (useDeviceAlloc) { - NEO::AllocationProperties allocationProperties{*rootDeviceIndices.begin(), alignedSize, allocationType, devices[0]->getNEODevice()->getDeviceBitfield()}; + NEO::AllocationProperties allocationProperties{*rootDeviceIndices.begin(), this->eventPoolSize, allocationType, devices[0]->getNEODevice()->getDeviceBitfield()}; allocationProperties.alignment = eventAlignment; if (eventPoolFlags & ZE_EVENT_POOL_FLAG_IPC) { this->isShareableEventMemory = true; @@ -110,7 +108,7 @@ ze_result_t EventPoolImp::initialize(DriverHandle *driver, Context *context, uin } } else { - NEO::AllocationProperties allocationProperties{*rootDeviceIndices.begin(), alignedSize, allocationType, systemMemoryBitfield}; + NEO::AllocationProperties allocationProperties{*rootDeviceIndices.begin(), this->eventPoolSize, allocationType, systemMemoryBitfield}; allocationProperties.alignment = eventAlignment; eventPoolPtr = driver->getMemoryManager()->createMultiGraphicsAllocationInSystemMemoryPool(rootDeviceIndices, @@ -158,6 +156,22 @@ ze_result_t EventPoolImp::createEvent(const ze_event_desc_t *desc, ze_event_hand return ZE_RESULT_SUCCESS; } +void EventPoolImp::initializeSizeParameters(uint32_t numDevices, ze_device_handle_t *deviceHandles, DriverHandleImp &driver, const NEO::HardwareInfo &hwInfo) { + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + setEventAlignment(static_cast(hwHelper.getTimestampPacketAllocatorAlignment())); + + bool useDynamicEventPackets = l0HwHelper.useDynamicEventPacketsCount(hwInfo); + eventPackets = EventPacketsCount::eventPackets; + if (useDynamicEventPackets) { + eventPackets = driver.getEventMaxPacketCount(numDevices, deviceHandles); + } + setEventSize(static_cast(alignUp(eventPackets * hwHelper.getSingleTimestampPacketSize(), eventAlignment))); + + eventPoolSize = alignUp(this->numEvents * eventSize, MemoryConstants::pageSize64k); +} + ze_result_t Event::destroy() { delete this; return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index e14b0ce17d..a2cc143d16 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -26,6 +26,7 @@ struct MetricStreamer; struct ContextImp; struct Context; struct DriverHandle; +struct DriverHandleImp; struct Device; namespace EventPacketsCount { @@ -104,7 +105,7 @@ struct Event : _ze_event_handle_t { void increaseKernelCount() { kernelCount++; - UNRECOVERABLE_IF(kernelCount > EventPacketsCount::maxKernelSplit); + UNRECOVERABLE_IF(kernelCount > maxKernelCount); } uint32_t getKernelCount() const { return kernelCount; @@ -123,6 +124,16 @@ struct Event : _ze_event_handle_t { this->isCompleted = false; } + uint32_t getMaxPacketsCount() const { + return maxPacketCount; + } + void setMaxKernelCount(uint32_t value) { + maxKernelCount = value; + } + uint32_t getMaxKernelCount() const { + return maxKernelCount; + } + uint64_t globalStartTS; uint64_t globalEndTS; uint64_t contextStartTS; @@ -152,7 +163,9 @@ struct Event : _ze_event_handle_t { size_t gpuStartTimestamp = 0u; size_t gpuEndTimestamp = 0u; + uint32_t maxKernelCount = 0; uint32_t kernelCount = 1u; + uint32_t maxPacketCount = 0; bool isTimestampEvent = false; bool usingContextEndOffset = false; @@ -286,6 +299,9 @@ struct EventPoolImp : public EventPool { void setEventSize(uint32_t size) override { eventSize = size; } void setEventAlignment(uint32_t alignment) override { eventAlignment = alignment; } size_t getNumEvents() { return numEvents; } + uint32_t getEventMaxPackets() { return eventPackets; } + size_t getEventPoolSize() const { return eventPoolSize; } + void initializeSizeParameters(uint32_t numDevices, ze_device_handle_t *deviceHandles, DriverHandleImp &driver, const NEO::HardwareInfo &hwInfo); Device *getDevice() override { return devices[0]; } @@ -297,8 +313,10 @@ struct EventPoolImp : public EventPool { bool isShareableEventMemory = false; protected: + size_t eventPoolSize = 0; uint32_t eventAlignment = 0; uint32_t eventSize = 0; + uint32_t eventPackets = 0; }; } // namespace L0 diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index 92568821f4..660331d54e 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -22,7 +22,15 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device * event->setEventTimestampFlag(true); } auto neoDevice = device->getNEODevice(); - event->kernelEventCompletionData = std::make_unique[]>(EventPacketsCount::maxKernelSplit); + auto &hwInfo = neoDevice->getHardwareInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + uint32_t maxKernels = EventPacketsCount::maxKernelSplit; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + maxKernels = l0HwHelper.getEventMaxKernelCount(hwInfo); + } + + event->kernelEventCompletionData = std::make_unique[]>(maxKernels); auto alloc = eventPool->getAllocation().getGraphicsAllocation(neoDevice->getRootDeviceIndex()); @@ -32,7 +40,9 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device * event->signalScope = desc->signal; event->waitScope = desc->wait; event->csr = neoDevice->getDefaultEngine().commandStreamReceiver; - bool useContextEndOffset = L0HwHelper::get(neoDevice->getHardwareInfo().platform.eRenderCoreFamily).multiTileCapablePlatform(); + event->maxKernelCount = maxKernels; + event->maxPacketCount = static_cast(eventPool)->getEventMaxPackets(); + bool useContextEndOffset = l0HwHelper.multiTileCapablePlatform(); int32_t overrideUseContextEndOffset = NEO::DebugManager.flags.UseContextEndOffsetForEventCompletion.get(); if (overrideUseContextEndOffset != -1) { useContextEndOffset = !!overrideUseContextEndOffset; @@ -293,7 +303,7 @@ ze_result_t EventImp::reset() { template void EventImp::resetDeviceCompletionData() { - this->kernelCount = EventPacketsCount::maxKernelSplit; + this->kernelCount = this->maxKernelCount; for (uint32_t i = 0; i < kernelCount; i++) { this->kernelEventCompletionData[i].setPacketsUsed(NEO::TimestampPacketSizeControl::preferredPacketCount); } diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper.cpp b/level_zero/core/source/hw_helpers/l0_hw_helper.cpp index 45fd65bd93..c189e634cc 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper.cpp +++ b/level_zero/core/source/hw_helpers/l0_hw_helper.cpp @@ -60,4 +60,11 @@ bool L0HwHelper::useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo) { return false; } +bool L0HwHelper::useDynamicEventPacketsCount(const NEO::HardwareInfo &hwInfo) { + if (NEO::DebugManager.flags.UseDynamicEventPacketsCount.get() != -1) { + return !!NEO::DebugManager.flags.UseDynamicEventPacketsCount.get(); + } + return false; +} + } // namespace L0 diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper.h b/level_zero/core/source/hw_helpers/l0_hw_helper.h index 3ad42fefed..2cd0db66a8 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper.h +++ b/level_zero/core/source/hw_helpers/l0_hw_helper.h @@ -36,6 +36,7 @@ class L0HwHelper { static bool enableImmediateCmdListHeapSharing(const NEO::HardwareInfo &hwInfo, bool cmdlistSupport); static bool usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo); static bool useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo); + static bool useDynamicEventPacketsCount(const NEO::HardwareInfo &hwInfo); virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0; virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0; @@ -53,6 +54,9 @@ class L0HwHelper { virtual bool platformSupportsFrontEndTracking(const NEO::HardwareInfo &hwInfo) const = 0; virtual bool platformSupportsPipelineSelectTracking(const NEO::HardwareInfo &hwInfo) const = 0; + virtual uint32_t getEventMaxKernelCount(const NEO::HardwareInfo &hwInfo) const = 0; + virtual uint32_t getEventBaseMaxPacketCount(const NEO::HardwareInfo &hwInfo) const = 0; + protected: L0HwHelper() = default; }; @@ -80,6 +84,9 @@ class L0HwHelperHw : public L0HwHelper { bool platformSupportsStateComputeModeTracking(const NEO::HardwareInfo &hwInfo) const override; bool platformSupportsFrontEndTracking(const NEO::HardwareInfo &hwInfo) const override; bool platformSupportsPipelineSelectTracking(const NEO::HardwareInfo &hwInfo) const override; + + uint32_t getEventMaxKernelCount(const NEO::HardwareInfo &hwInfo) const override; + uint32_t getEventBaseMaxPacketCount(const NEO::HardwareInfo &hwInfo) const override; }; } // namespace L0 diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper_pvc_and_later.inl b/level_zero/core/source/hw_helpers/l0_hw_helper_pvc_and_later.inl index 1275189001..eb6abaabfd 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper_pvc_and_later.inl +++ b/level_zero/core/source/hw_helpers/l0_hw_helper_pvc_and_later.inl @@ -38,24 +38,4 @@ void L0HwHelperHw::setAdditionalGroupProperty(ze_command_queue_group_pro } } -template -bool L0HwHelperHw::platformSupportsCmdListHeapSharing(const NEO::HardwareInfo &hwInfo) const { - return false; -} - -template -bool L0HwHelperHw::platformSupportsStateComputeModeTracking(const NEO::HardwareInfo &hwInfo) const { - return false; -} - -template -bool L0HwHelperHw::platformSupportsFrontEndTracking(const NEO::HardwareInfo &hwInfo) const { - return false; -} - -template -bool L0HwHelperHw::platformSupportsPipelineSelectTracking(const NEO::HardwareInfo &hwInfo) const { - return false; -} - } // namespace L0 diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper_skl_and_later.inl b/level_zero/core/source/hw_helpers/l0_hw_helper_skl_and_later.inl index 621ce917c9..619eb94a41 100644 --- a/level_zero/core/source/hw_helpers/l0_hw_helper_skl_and_later.inl +++ b/level_zero/core/source/hw_helpers/l0_hw_helper_skl_and_later.inl @@ -38,4 +38,14 @@ bool L0HwHelperHw::platformSupportsPipelineSelectTracking(const NEO::Har return false; } +template +uint32_t L0HwHelperHw::getEventMaxKernelCount(const NEO::HardwareInfo &hwInfo) const { + return 1; +} + +template +uint32_t L0HwHelperHw::getEventBaseMaxPacketCount(const NEO::HardwareInfo &hwInfo) const { + return 1u; +} + } // namespace L0 diff --git a/level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl b/level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl new file mode 100644 index 0000000000..a1aac70c19 --- /dev/null +++ b/level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/hw_helper.h" + +#include "level_zero/core/source/hw_helpers/l0_hw_helper.h" + +namespace L0 { + +template +bool L0HwHelperHw::multiTileCapablePlatform() const { + return false; +} + +template +bool L0HwHelperHw::platformSupportsCmdListHeapSharing(const NEO::HardwareInfo &hwInfo) const { + return false; +} + +template +bool L0HwHelperHw::platformSupportsStateComputeModeTracking(const NEO::HardwareInfo &hwInfo) const { + return false; +} + +template +bool L0HwHelperHw::platformSupportsFrontEndTracking(const NEO::HardwareInfo &hwInfo) const { + return false; +} + +template +bool L0HwHelperHw::platformSupportsPipelineSelectTracking(const NEO::HardwareInfo &hwInfo) const { + return false; +} + +template +uint32_t L0HwHelperHw::getEventMaxKernelCount(const NEO::HardwareInfo &hwInfo) const { + uint32_t kernelCount = EventPacketsCount::maxKernelSplit; + if (L0HwHelper::usePipeControlMultiKernelEventSync(hwInfo)) { + kernelCount = 1; + } + return kernelCount; +} + +template +uint32_t L0HwHelperHw::getEventBaseMaxPacketCount(const NEO::HardwareInfo &hwInfo) const { + uint32_t basePackets = getEventMaxKernelCount(hwInfo); + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo)) { + basePackets += L0HwHelper::useCompactL3FlushEventPacket(hwInfo) ? 0 : 1; + } + + return basePackets; +} + +} // namespace L0 diff --git a/level_zero/core/source/xe_hp_core/l0_hw_helper_xe_hp_core.cpp b/level_zero/core/source/xe_hp_core/l0_hw_helper_xe_hp_core.cpp index bdf24d51eb..0f9c9cc34b 100644 --- a/level_zero/core/source/xe_hp_core/l0_hw_helper_xe_hp_core.cpp +++ b/level_zero/core/source/xe_hp_core/l0_hw_helper_xe_hp_core.cpp @@ -9,7 +9,7 @@ #include "level_zero/core/source/helpers/l0_populate_factory.h" #include "level_zero/core/source/hw_helpers/l0_hw_helper_base.inl" -#include "level_zero/core/source/hw_helpers/l0_hw_helper_skl_and_later.inl" +#include "level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl" namespace L0 { @@ -32,6 +32,10 @@ bool L0HwHelperHw::multiTileCapablePlatform() const { return true; } +template <> +void L0HwHelperHw::setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const { +} + template <> bool L0HwHelperHw::platformSupportsPipelineSelectTracking(const NEO::HardwareInfo &hwInfo) const { return true; diff --git a/level_zero/core/source/xe_hpc_core/l0_hw_helper_xe_hpc_core.cpp b/level_zero/core/source/xe_hpc_core/l0_hw_helper_xe_hpc_core.cpp index f63107f4a2..0dee969be6 100644 --- a/level_zero/core/source/xe_hpc_core/l0_hw_helper_xe_hpc_core.cpp +++ b/level_zero/core/source/xe_hpc_core/l0_hw_helper_xe_hpc_core.cpp @@ -10,6 +10,7 @@ #include "level_zero/core/source/helpers/l0_populate_factory.h" #include "level_zero/core/source/hw_helpers/l0_hw_helper_base.inl" #include "level_zero/core/source/hw_helpers/l0_hw_helper_pvc_and_later.inl" +#include "level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl" namespace L0 { diff --git a/level_zero/core/source/xe_hpg_core/l0_hw_helper_xe_hpg_core.cpp b/level_zero/core/source/xe_hpg_core/l0_hw_helper_xe_hpg_core.cpp index 0b552f03ed..11c39f9d3c 100644 --- a/level_zero/core/source/xe_hpg_core/l0_hw_helper_xe_hpg_core.cpp +++ b/level_zero/core/source/xe_hpg_core/l0_hw_helper_xe_hpg_core.cpp @@ -9,7 +9,7 @@ #include "level_zero/core/source/helpers/l0_populate_factory.h" #include "level_zero/core/source/hw_helpers/l0_hw_helper_base.inl" -#include "level_zero/core/source/hw_helpers/l0_hw_helper_skl_and_later.inl" +#include "level_zero/core/source/hw_helpers/l0_hw_helper_xehp_and_later.inl" namespace L0 { @@ -27,6 +27,10 @@ bool L0HwHelperHw::isResumeWARequired() { return true; } +template <> +void L0HwHelperHw::setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const { +} + template <> bool L0HwHelperHw::platformSupportsCmdListHeapSharing(const NEO::HardwareInfo &hwInfo) const { return true; diff --git a/level_zero/core/test/unit_tests/mocks/mock_device.h b/level_zero/core/test/unit_tests/mocks/mock_device.h index 7b1ce5bc42..e6dafa482a 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_device.h +++ b/level_zero/core/test/unit_tests/mocks/mock_device.h @@ -83,6 +83,7 @@ struct Mock : public Device { ADDMETHOD_NOBASE(obtainReusableAllocation, NEO::GraphicsAllocation *, nullptr, (size_t requiredSize, NEO::AllocationType type)) ADDMETHOD_NOBASE_VOIDRETURN(storeReusableAllocation, (NEO::GraphicsAllocation & alloc)); ADDMETHOD_NOBASE(getFabricVertex, ze_result_t, ZE_RESULT_SUCCESS, (ze_fabric_vertex_handle_t * phVertex)); + ADDMETHOD_CONST_NOBASE(getEventMaxPacketCount, uint32_t, 8, ()) DebugSession *createDebugSession(const zet_debug_config_t &config, ze_result_t &result, bool isRootAttach) override { result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/level_zero/core/test/unit_tests/mocks/mock_driver_handle.h b/level_zero/core/test/unit_tests/mocks/mock_driver_handle.h index f899737cdc..b97b0ce60c 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_driver_handle.h +++ b/level_zero/core/test/unit_tests/mocks/mock_driver_handle.h @@ -35,6 +35,7 @@ struct Mock : public DriverHandleImp { ADDMETHOD_NOBASE(releaseImportedPointer, ze_result_t, ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, (void *ptr)) ADDMETHOD_NOBASE(getHostPointerBaseAddress, ze_result_t, ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, (void *ptr, void **baseAddress)) ADDMETHOD_NOBASE(findHostPointerAllocation, NEO::GraphicsAllocation *, nullptr, (void *ptr, size_t size, uint32_t rootDeviceIndex)) + ADDMETHOD_CONST_NOBASE(getEventMaxPacketCount, uint32_t, 8, (uint32_t, ze_device_handle_t *)) void setupDevices(std::vector> devices); diff --git a/level_zero/core/test/unit_tests/mocks/mock_event.h b/level_zero/core/test/unit_tests/mocks/mock_event.h index 33dfaa94ee..00d417a7cb 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_event.h +++ b/level_zero/core/test/unit_tests/mocks/mock_event.h @@ -22,6 +22,7 @@ struct WhiteBox<::L0::Event> : public ::L0::Event { using BaseClass::csr; using BaseClass::hostAddress; using BaseClass::l3FlushAppliedOnKernel; + using BaseClass::maxKernelCount; }; using Event = WhiteBox<::L0::Event>; @@ -71,6 +72,7 @@ class MockEvent : public ::L0::Event { using ::L0::Event::gpuStartTimestamp; using ::L0::Event::isCompleted; using ::L0::Event::l3FlushAppliedOnKernel; + using ::L0::Event::maxKernelCount; MockEvent() { mockAllocation.reset(new NEO::MockGraphicsAllocation(0, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp index e40ac75652..001e9ae0bb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp @@ -11,6 +11,7 @@ #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h" +#include "level_zero/core/source/hw_helpers/l0_hw_helper.h" #include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" @@ -91,16 +92,29 @@ HWTEST_F(CommandListAppendEventReset, givenCmdlistWhenResetEventWithTimeStampIsA gpuAddress += event->getContextEndOffset(); } - auto itorSdi = findAll(cmdList.begin(), cmdList.end()); - uint32_t sdiFound = 0; - ASSERT_NE(0u, itorSdi.size()); - for (auto it : itorSdi) { - auto cmd = genCmdCast(*it); - EXPECT_EQ(gpuAddress, cmd->getAddress()); - gpuAddress += event->getSinglePacketSize(); - sdiFound++; + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + uint32_t maxPackets = EventPacketsCount::eventPackets; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + maxPackets = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + } + + auto itorSdi = findAll(cmdList.begin(), cmdList.end()); + + if (maxPackets == 1) { + EXPECT_EQ(0u, itorSdi.size()); + } else { + uint32_t sdiFound = 0; + ASSERT_NE(0u, itorSdi.size()); + for (auto it : itorSdi) { + auto cmd = genCmdCast(*it); + EXPECT_EQ(gpuAddress, cmd->getAddress()); + gpuAddress += event->getSinglePacketSize(); + sdiFound++; + } + EXPECT_EQ(EventPacketsCount::eventPackets - 1, sdiFound); } - EXPECT_EQ(EventPacketsCount::eventPackets - 1, sdiFound); uint32_t postSyncFound = 0; for (auto it : itorPC) { @@ -216,6 +230,9 @@ HWTEST2_F(CommandListAppendEventReset, givenTimestampEventUsedInResetThenPipeCon using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; auto &commandContainer = commandList->commandContainer; + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + ze_event_pool_desc_t eventPoolDesc = {}; eventPoolDesc.count = 1; eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; @@ -234,7 +251,12 @@ HWTEST2_F(CommandListAppendEventReset, givenTimestampEventUsedInResetThenPipeCon auto contextOffset = event->getContextEndOffset(); auto baseAddr = event->getGpuAddress(device); auto gpuAddress = ptrOffset(baseAddr, contextOffset); - gpuAddress += ((EventPacketsCount::eventPackets - 1) * event->getSinglePacketSize()); + + uint32_t maxPackets = EventPacketsCount::eventPackets; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + maxPackets = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + } + gpuAddress += ((maxPackets - 1) * event->getSinglePacketSize()); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp index bcda2f4096..8dae487f43 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp @@ -217,6 +217,8 @@ HWTEST_F(CommandListAppendWaitOnEvent, WhenAppendingWaitOnTimestampEventWithThre EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + event->setMaxKernelCount(3u); + event->setPacketsInUse(3u); event->increaseKernelCount(); event->setPacketsInUse(3u); diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index d159e1a99f..492397ed2e 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -158,10 +158,18 @@ HWTEST_F(EventPoolCreate, givenTimestampEventsThenEventSizeSufficientForAllKerne std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); EXPECT_EQ(ZE_RESULT_SUCCESS, result); ASSERT_NE(nullptr, eventPool); - uint32_t maxKernelSplit = 3; - uint32_t packetsSize = maxKernelSplit * NEO::TimestampPacketSizeControl::preferredPacketCount * + + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + uint32_t maxPacketCount = EventPacketsCount::maxKernelSplit * NEO::TimestampPacketSizeControl::preferredPacketCount; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + maxPacketCount = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + } + uint32_t packetsSize = maxPacketCount * static_cast(NEO::TimestampPackets::getSinglePacketSize()); - uint32_t kernelTimestampsSize = static_cast(alignUp(packetsSize, 4 * MemoryConstants::cacheLineSize)); + uint32_t kernelTimestampsSize = static_cast(alignUp(packetsSize, hwHelper.getTimestampPacketAllocatorAlignment())); EXPECT_EQ(kernelTimestampsSize, eventPool->getEventSize()); } @@ -661,7 +669,8 @@ TEST_F(EventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrectDataAndO EXPECT_EQ(ZE_RESULT_SUCCESS, result); ASSERT_NE(nullptr, eventPool); - auto &l0HwHelper = L0HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily); + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); auto event = std::unique_ptr(l0HwHelper.createEvent(eventPool.get(), &eventDesc, device)); ASSERT_NE(nullptr, event); @@ -676,6 +685,10 @@ TEST_F(EventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrectDataAndO eventCompletionMemory = ptrOffset(eventCompletionMemory, event->getContextEndOffset()); } uint32_t maxPacketsCount = EventPacketsCount::maxKernelSplit * NEO::TimestampPacketSizeControl::preferredPacketCount; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + maxPacketsCount = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + } + for (uint32_t i = 0; i < maxPacketsCount; i++) { EXPECT_EQ(Event::STATE_INITIAL, *eventCompletionMemory); eventCompletionMemory = ptrOffset(eventCompletionMemory, event->getSinglePacketSize()); @@ -1142,9 +1155,9 @@ struct EventCreateAllocationResidencyTest : public ::testing::Test { L0::Device *device = nullptr; }; -class TimestampEventCreate : public Test { +class TimestampEventCreateFixture : public DeviceFixture { public: - void SetUp() override { + void setUp() { DeviceFixture::setUp(); ze_event_pool_desc_t eventPoolDesc = {}; eventPoolDesc.count = 1; @@ -1163,7 +1176,7 @@ class TimestampEventCreate : public Test { ASSERT_NE(nullptr, event); } - void TearDown() override { + void tearDown() { event.reset(nullptr); eventPool.reset(nullptr); DeviceFixture::tearDown(); @@ -1173,13 +1186,33 @@ class TimestampEventCreate : public Test { std::unique_ptr> event; }; +struct TimestampEventCreateMultiKernelFixture : public TimestampEventCreateFixture { + void setUp() { + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(0); + TimestampEventCreateFixture::setUp(); + } + + DebugManagerStateRestore restorer; +}; + +using TimestampEventCreate = Test; +using TimestampEventCreateMultiKernel = Test; + TEST_F(TimestampEventCreate, givenEventCreatedWithTimestampThenIsTimestampEventFlagSet) { EXPECT_TRUE(event->isEventTimestampFlagSet()); } TEST_F(TimestampEventCreate, givenEventTimestampsCreatedWhenResetIsInvokeThenCorrectDataAreSet) { + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + uint32_t maxKernelCount = EventPacketsCount::maxKernelSplit; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + maxKernelCount = l0HwHelper.getEventMaxKernelCount(hwInfo); + } + EXPECT_NE(nullptr, event->kernelEventCompletionData); - for (auto j = 0u; j < EventPacketsCount::maxKernelSplit; j++) { + for (auto j = 0u; j < maxKernelCount; j++) { for (auto i = 0u; i < NEO::TimestampPacketSizeControl::preferredPacketCount; i++) { EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelEventCompletionData[j].getContextStartValue(i)); EXPECT_EQ(static_cast(Event::State::STATE_INITIAL), event->kernelEventCompletionData[j].getGlobalStartValue(i)); @@ -1212,7 +1245,7 @@ TEST_F(TimestampEventCreate, givenTimestampEventThenAllocationsIsDependentIfAllo } } -TEST_F(TimestampEventCreate, givenEventTimestampWhenPacketCountIsSetThenCorrectOffsetIsReturned) { +HWTEST2_F(TimestampEventCreateMultiKernel, givenEventTimestampWhenPacketCountIsSetThenCorrectOffsetIsReturned, IsAtLeastXeHpCore) { EXPECT_EQ(1u, event->getPacketsInUse()); auto gpuAddr = event->getGpuAddress(device); EXPECT_EQ(gpuAddr, event->getPacketAddress(device)); @@ -1242,7 +1275,7 @@ TEST_F(TimestampEventCreate, givenEventWhenSignaledAndResetFromTheHostThenCorrec event->reset(); result = event->queryStatus(); EXPECT_EQ(ZE_RESULT_NOT_READY, result); - for (auto j = 0u; j < EventPacketsCount::maxKernelSplit; j++) { + for (auto j = 0u; j < event->getKernelCount(); j++) { for (auto i = 0u; i < NEO::TimestampPacketSizeControl::preferredPacketCount; i++) { EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelEventCompletionData[j].getContextStartValue(i)); EXPECT_EQ(Event::State::STATE_INITIAL, event->kernelEventCompletionData[j].getGlobalStartValue(i)); @@ -1446,8 +1479,9 @@ TEST_F(TimestampEventCreate, givenEventWhenQueryingTimestampExpThenCorrectDataSe } } -TEST_F(TimestampEventCreate, givenTimeStampEventUsedOnTwoKernelsWhenL3FlushSetOnFirstKernelThenDoNotUseSecondPacketOfFirstKernel) { +HWTEST2_F(TimestampEventCreateMultiKernel, givenTimeStampEventUsedOnTwoKernelsWhenL3FlushSetOnFirstKernelThenDoNotUseSecondPacketOfFirstKernel, IsAtLeastXeHpCore) { typename MockTimestampPackets32::Packet packetData[4]; + event->hostAddress = packetData; constexpr uint32_t kernelStartValue = 5u; @@ -1489,8 +1523,9 @@ TEST_F(TimestampEventCreate, givenTimeStampEventUsedOnTwoKernelsWhenL3FlushSetOn EXPECT_EQ(static_cast(kernelEndValue), results.global.kernelEnd); } -TEST_F(TimestampEventCreate, givenTimeStampEventUsedOnTwoKernelsWhenL3FlushSetOnSecondKernelThenDoNotUseSecondPacketOfSecondKernel) { +HWTEST2_F(TimestampEventCreateMultiKernel, givenTimeStampEventUsedOnTwoKernelsWhenL3FlushSetOnSecondKernelThenDoNotUseSecondPacketOfSecondKernel, IsAtLeastXeHpCore) { typename MockTimestampPackets32::Packet packetData[4]; + event->hostAddress = packetData; constexpr uint32_t kernelStartValue = 5u; @@ -1532,7 +1567,7 @@ TEST_F(TimestampEventCreate, givenTimeStampEventUsedOnTwoKernelsWhenL3FlushSetOn EXPECT_EQ(static_cast(kernelEndValue), results.global.kernelEnd); } -TEST_F(TimestampEventCreate, givenOverflowingTimeStampDataOnTwoKernelsWhenQueryKernelTimestampIsCalledOverflowIsObserved) { +HWTEST2_F(TimestampEventCreateMultiKernel, givenOverflowingTimeStampDataOnTwoKernelsWhenQueryKernelTimestampIsCalledOverflowIsObserved, IsAtLeastXeHpCore) { typename MockTimestampPackets32::Packet packetData[4] = {}; event->hostAddress = packetData; @@ -1993,10 +2028,12 @@ TEST_F(EventTests, givenEventUseMultiplePacketsWhenHostSignalThenExpectAllPacket } } -TEST_F(EventTests, WhenSettingL3FlushOnEventThenSetOnParticularKernel) { +HWTEST2_F(EventTests, WhenSettingL3FlushOnEventThenSetOnParticularKernel, IsAtLeastXeHpCore) { + DebugManagerStateRestore restorer; + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(0); + auto event = whiteboxCast(Event::create(eventPool, &eventDesc, device)); ASSERT_NE(event, nullptr); - EXPECT_FALSE(event->getL3FlushForCurrenKernel()); event->setL3FlushForCurrentKernel(); @@ -2071,10 +2108,18 @@ HWTEST_F(EventSizeTests, whenCreatingEventPoolThenUseCorrectSizeAndAlignment) { eventPool.reset(static_cast(EventPool::create(device->getDriverHandle(), context, 1, &hDevice, &eventPoolDesc, result))); EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto &hwHelper = device->getHwHelper(); + auto &hwInfo = device->getHwInfo(); + + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + uint32_t packetCount = EventPacketsCount::eventPackets; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + packetCount = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + } auto expectedAlignment = static_cast(hwHelper.getTimestampPacketAllocatorAlignment()); auto singlePacketSize = TimestampPackets::getSinglePacketSize(); - auto expectedSize = static_cast(alignUp(EventPacketsCount::eventPackets * singlePacketSize, expectedAlignment)); + auto expectedSize = static_cast(alignUp(packetCount * singlePacketSize, expectedAlignment)); EXPECT_EQ(expectedSize, eventPool->getEventSize()); @@ -2097,10 +2142,17 @@ HWTEST_F(EventSizeTests, whenCreatingEventPoolThenUseCorrectSizeAndAlignment) { } HWTEST_F(EventSizeTests, givenDebugFlagwhenCreatingEventPoolThenUseCorrectSizeAndAlignment) { - auto &hwHelper = device->getHwHelper(); + auto &hwInfo = device->getHwInfo(); auto expectedAlignment = static_cast(hwHelper.getTimestampPacketAllocatorAlignment()); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + uint32_t packetCount = EventPacketsCount::eventPackets; + if (l0HwHelper.useDynamicEventPacketsCount(hwInfo)) { + packetCount = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + } + { DebugManager.flags.OverrideTimestampPacketSize.set(4); @@ -2109,7 +2161,7 @@ HWTEST_F(EventSizeTests, givenDebugFlagwhenCreatingEventPoolThenUseCorrectSizeAn EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto singlePacketSize = TimestampPackets::getSinglePacketSize(); - auto expectedSize = static_cast(alignUp(EventPacketsCount::eventPackets * singlePacketSize, expectedAlignment)); + auto expectedSize = static_cast(alignUp(packetCount * singlePacketSize, expectedAlignment)); EXPECT_EQ(expectedSize, eventPool->getEventSize()); @@ -2130,7 +2182,7 @@ HWTEST_F(EventSizeTests, givenDebugFlagwhenCreatingEventPoolThenUseCorrectSizeAn EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto singlePacketSize = TimestampPackets::getSinglePacketSize(); - auto expectedSize = static_cast(alignUp(EventPacketsCount::eventPackets * singlePacketSize, expectedAlignment)); + auto expectedSize = static_cast(alignUp(packetCount * singlePacketSize, expectedAlignment)); EXPECT_EQ(expectedSize, eventPool->getEventSize()); @@ -2285,5 +2337,134 @@ TEST_F(EventSynchronizeTest, whenEventSetCsrThenCorrectCsrSet) { EXPECT_EQ(event->csr, defaultCsr); } +template +struct EventDynamicPacketUseFixture : public DeviceFixture { + void setUp() { + NEO::DebugManager.flags.UseDynamicEventPacketsCount.set(1); + if (multiTile == 1) { + DebugManager.flags.CreateMultipleSubDevices.set(2); + DebugManager.flags.EnableImplicitScaling.set(1); + } + DeviceFixture::setUp(); + } + + void testAllDevices() { + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + ze_event_pool_desc_t eventPoolDesc = { + ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + nullptr, + 0, + 1}; + + ze_result_t result = ZE_RESULT_SUCCESS; + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + ASSERT_NE(nullptr, eventPool); + + auto eventPoolMaxPackets = static_cast(eventPool.get())->getEventMaxPackets(); + auto expectedPoolMaxPackets = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + if (multiTile == 1) { + expectedPoolMaxPackets *= 2; + } + EXPECT_EQ(expectedPoolMaxPackets, eventPoolMaxPackets); + + auto eventSize = eventPool->getEventSize(); + auto expectedEventSize = static_cast(alignUp(expectedPoolMaxPackets * hwHelper.getSingleTimestampPacketSize(), hwHelper.getTimestampPacketAllocatorAlignment())); + EXPECT_EQ(expectedEventSize, eventSize); + + ze_event_desc_t eventDesc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + 0, + ZE_EVENT_SCOPE_FLAG_DEVICE, + ZE_EVENT_SCOPE_FLAG_DEVICE}; + + std::unique_ptr event(Event::create(eventPool.get(), &eventDesc, device)); + + EXPECT_EQ(expectedPoolMaxPackets, event->getMaxPacketsCount()); + + uint32_t maxKernels = l0HwHelper.getEventMaxKernelCount(hwInfo); + EXPECT_EQ(maxKernels, event->getMaxKernelCount()); + } + + void testSingleDevice() { + ze_result_t result = ZE_RESULT_SUCCESS; + + auto &hwInfo = device->getHwInfo(); + auto &l0HwHelper = L0HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + + ze_event_pool_desc_t eventPoolDesc = { + ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + nullptr, + 0, + 1}; + + std::vector deviceHandles; + + L0::Device *eventDevice = device; + if (multiTile == 1) { + uint32_t count = 2; + ze_device_handle_t subDevices[2]; + result = device->getSubDevices(&count, subDevices); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + deviceHandles.push_back(subDevices[0]); + eventDevice = Device::fromHandle(subDevices[0]); + } else { + deviceHandles.push_back(device->toHandle()); + } + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 1, deviceHandles.data(), &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + ASSERT_NE(nullptr, eventPool); + + auto eventPoolMaxPackets = static_cast(eventPool.get())->getEventMaxPackets(); + auto expectedPoolMaxPackets = l0HwHelper.getEventBaseMaxPacketCount(hwInfo); + + EXPECT_EQ(expectedPoolMaxPackets, eventPoolMaxPackets); + + auto eventSize = eventPool->getEventSize(); + auto expectedEventSize = static_cast(alignUp(expectedPoolMaxPackets * hwHelper.getSingleTimestampPacketSize(), hwHelper.getTimestampPacketAllocatorAlignment())); + EXPECT_EQ(expectedEventSize, eventSize); + + ze_event_desc_t eventDesc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + 0, + ZE_EVENT_SCOPE_FLAG_DEVICE, + ZE_EVENT_SCOPE_FLAG_DEVICE}; + + std::unique_ptr event(Event::create(eventPool.get(), &eventDesc, eventDevice)); + + EXPECT_EQ(expectedPoolMaxPackets, event->getMaxPacketsCount()); + + uint32_t maxKernels = l0HwHelper.getEventMaxKernelCount(hwInfo); + EXPECT_EQ(maxKernels, event->getMaxKernelCount()); + } + + DebugManagerStateRestore restorer; +}; + +using EventDynamicPacketUseTest = Test>; +HWTEST2_F(EventDynamicPacketUseTest, testAllDevices, IsAtLeastSkl) { + testAllDevices(); +} + +HWTEST2_F(EventDynamicPacketUseTest, testSingleDevice, IsAtLeastSkl) { + testSingleDevice(); +} + +using EventMultiTileDynamicPacketUseTest = Test>; +HWTEST2_F(EventMultiTileDynamicPacketUseTest, testAllDevices, IsAtLeastXeHpCore) { + testAllDevices(); +} + +HWTEST2_F(EventMultiTileDynamicPacketUseTest, testSingleDevice, IsAtLeastXeHpCore) { + testSingleDevice(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp b/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp index 5a09f0f12e..716919c0fb 100644 --- a/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp +++ b/level_zero/core/test/unit_tests/sources/helper/l0_hw_helper_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/ptr_math.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" @@ -641,5 +642,84 @@ TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForCompactL3FlushEven EXPECT_FALSE(defaultValue); } +TEST_F(L0HwHelperTest, givenL0HelperWhenGettingDefaultValueForDynamicEventPacketCountThenReturnFalse) { + auto hwInfo = *NEO::defaultHwInfo.get(); + bool defaultValue = L0::L0HwHelper::useDynamicEventPacketsCount(hwInfo); + EXPECT_FALSE(defaultValue); +} + +HWTEST2_F(L0HwHelperTest, givenL0HelperWhenGettingMaxKernelAndMaxPacketThenExpectBothReturnOne, NonMultiTilePlatforms) { + auto hwInfo = *NEO::defaultHwInfo.get(); + EXPECT_EQ(1u, L0::L0HwHelperHw::get().getEventMaxKernelCount(hwInfo)); + EXPECT_EQ(1u, L0::L0HwHelperHw::get().getEventBaseMaxPacketCount(hwInfo)); +} + +template +struct L0HwHelperMultiPacketEventFixture { + void setUp() { + DebugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); + DebugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); + } + + void tearDown() { + } + + DebugManagerStateRestore restorer; +}; + +using L0HwHelperEventMultiKernelEnabledL3FlushCompactDisabledTest = Test>; +HWTEST2_F(L0HwHelperEventMultiKernelEnabledL3FlushCompactDisabledTest, + givenL0HelperWhenGettingMaxKernelAndMaxPacketThenExpectKernelThreeAndPacketThreeWithL3PacketWhenApplicable, + IsAtLeastXeHpCore) { + auto hwInfo = *NEO::defaultHwInfo.get(); + + uint32_t expectedPacket = 3; + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo)) { + expectedPacket++; + } + + EXPECT_EQ(3u, L0::L0HwHelperHw::get().getEventMaxKernelCount(hwInfo)); + EXPECT_EQ(expectedPacket, L0::L0HwHelperHw::get().getEventBaseMaxPacketCount(hwInfo)); +} + +using L0HwHelperEventMultiKernelEnabledL3FlushCompactEnabledTest = Test>; +HWTEST2_F(L0HwHelperEventMultiKernelEnabledL3FlushCompactEnabledTest, + givenL0HelperWhenGettingMaxKernelAndMaxPacketThenExpectKernelThreeAndPacketThree, + IsAtLeastXeHpCore) { + auto hwInfo = *NEO::defaultHwInfo.get(); + + uint32_t expectedPacket = 3; + + EXPECT_EQ(3u, L0::L0HwHelperHw::get().getEventMaxKernelCount(hwInfo)); + EXPECT_EQ(expectedPacket, L0::L0HwHelperHw::get().getEventBaseMaxPacketCount(hwInfo)); +} + +using L0HwHelperEventMultiKernelDisabledL3FlushCompactDisabledTest = Test>; +HWTEST2_F(L0HwHelperEventMultiKernelDisabledL3FlushCompactDisabledTest, + givenL0HelperWhenGettingMaxKernelAndMaxPacketThenExpectKernelOneAndPacketOneWithL3PacketWhenApplicable, + IsAtLeastXeHpCore) { + auto hwInfo = *NEO::defaultHwInfo.get(); + + uint32_t expectedPacket = 1; + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo)) { + expectedPacket++; + } + + EXPECT_EQ(1u, L0::L0HwHelperHw::get().getEventMaxKernelCount(hwInfo)); + EXPECT_EQ(expectedPacket, L0::L0HwHelperHw::get().getEventBaseMaxPacketCount(hwInfo)); +} + +using L0HwHelperEventMultiKernelDisabledL3FlushCompactEnabledTest = Test>; +HWTEST2_F(L0HwHelperEventMultiKernelDisabledL3FlushCompactEnabledTest, + givenL0HelperWhenGettingMaxKernelAndMaxPacketThenExpectKernelOneAndPacketOne, + IsAtLeastXeHpCore) { + auto hwInfo = *NEO::defaultHwInfo.get(); + + uint32_t expectedPacket = 1; + + EXPECT_EQ(1u, L0::L0HwHelperHw::get().getEventMaxKernelCount(hwInfo)); + EXPECT_EQ(expectedPacket, L0::L0HwHelperHw::get().getEventBaseMaxPacketCount(hwInfo)); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp b/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp index 073a0fc190..7f3d02cd36 100644 --- a/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hp_core/test_cmdlist_xe_hp_core.cpp @@ -144,7 +144,17 @@ HWTEST2_F(CommandListAppendLaunchKernelWithAtomics, givenKernelWithGlobalAtomics EXPECT_FALSE(pCommandList->commandContainer.lastSentUseGlobalAtomics); } -using MultTileCommandListAppendLaunchKernelL3Flush = Test>; +struct MultTileCommandListAppendLaunchKernelL3FlushFixture : public MultiTileCommandListFixture { + using BaseClass = MultiTileCommandListFixture; + void setUp() { + DebugManager.flags.CompactL3FlushEventPacket.set(0); + BaseClass::setUp(); + } + + DebugManagerStateRestore restorer; +}; + +using MultTileCommandListAppendLaunchKernelL3Flush = Test; HWTEST2_F(MultTileCommandListAppendLaunchKernelL3Flush, givenKernelWithRegularEventAndWithWalkerPartitionThenProperCommandsEncoded, IsXeHpCore) { using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; @@ -262,7 +272,17 @@ HWTEST2_F(MultTileCommandListAppendLaunchKernelL3Flush, givenKernelWithTimestamp ASSERT_LE(1u, postSyncCount); } -using CommandListAppendLaunchKernelL3Flush = Test; +struct CommandListAppendLaunchKernelL3FlushFixture : public ModuleFixture { + void setUp() { + DebugManager.flags.CompactL3FlushEventPacket.set(0); + ModuleFixture::setUp(); + } + + DebugManagerStateRestore restorer; +}; + +using CommandListAppendLaunchKernelL3Flush = Test; + HWTEST2_F(CommandListAppendLaunchKernelL3Flush, givenKernelWithEventAndWithoutWalkerPartitionThenProperCommandsEncoded, IsXeHpCore) { using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; @@ -709,6 +729,9 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenEventWhenInvokingAppendLaunchKerne using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + DebugManagerStateRestore restorer; + DebugManager.flags.CompactL3FlushEventPacket.set(0); + createKernel(); ze_result_t returnValue; std::unique_ptr commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)); diff --git a/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp b/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp index c1dc3829eb..ff1a195271 100644 --- a/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp @@ -408,6 +408,9 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpgCore, givenEventWhenAppendKernelIsCa using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + DebugManagerStateRestore restorer; + DebugManager.flags.CompactL3FlushEventPacket.set(0); + Mock<::L0::Kernel> kernel; auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); kernel.module = pMockModule.get(); diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index df0a409972..67cf17cdbc 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -420,6 +420,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableFlushTaskSubmission, -1, "Driver uses csr DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateCmdListHeapSharing, -1, "Immediate command lists using flush task use current csr heap instead private cmd list heap, -1:default (disabled), 0:disabled, 1:enabled") DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlMultiKernelEventSync, -1, "Use single PIPE_CONTROL for event signal of multi-kernel append operations instead multi-packet POSTSYNC_DATA from each COMPUTE_WALKER, -1: default , 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, CompactL3FlushEventPacket, -1, "Compact COMPUTE_WALKER event packet and L3 Flush signal packet into single event packet, -1: default , 0: disabled, 1: enabled") +DECLARE_DEBUG_VARIABLE(int32_t, UseDynamicEventPacketsCount, -1, "Use dynamic estimation for event packet count based on a given device configuration, -1: default , 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always") /* IMPLICIT SCALING */ diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 953e619933..7278ab1221 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -434,6 +434,7 @@ UseTileMemoryBankInVirtualMemoryCreation = -1 DisableScratchPages = 0 ForceAllResourcesUncached = 0 ForcePreParserEnabledForMiArbCheck = -1 +UseDynamicEventPacketsCount = -1 BatchBufferStartPrepatchingWaEnabled = -1 SetVmAdviseAtomicAttribute = -1 DirectSubmissionForceLocalMemoryStorageMode = -1