From b21fef96cbe4ea158fe064d5035c40c0683dbede Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 6 Oct 2022 12:21:52 +0000 Subject: [PATCH] Wait for events from host Signed-off-by: Lukasz Jobczyk --- level_zero/core/source/cmdlist/cmdlist.h | 1 + .../source/cmdlist/cmdlist_hw_immediate.h | 1 + .../source/cmdlist/cmdlist_hw_immediate.inl | 38 +++++- .../core/source/cmdlist/cmdlist_imp.cpp | 3 + level_zero/core/source/cmdqueue/cmdqueue.cpp | 2 + .../sources/cmdlist/test_cmdlist_7.cpp | 124 ++++++++++++++++++ .../test/unit_test/linux/main_linux_dll.cpp | 5 + .../command_stream/command_stream_receiver.h | 12 ++ .../debug_settings/debug_variables_base.inl | 3 + .../os_interface/linux/sys_calls_linux.cpp | 6 + shared/source/os_interface/sys_calls_common.h | 4 +- .../source/os_interface/windows/sys_calls.cpp | 6 +- .../linux/sys_calls_linux_ult.cpp | 4 + .../common/os_interface/windows/sys_calls.cpp | 6 +- shared/test/common/test_files/igdrcl.config | 3 + .../command_stream_receiver_tests.cpp | 17 +++ 16 files changed, 231 insertions(+), 4 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index ae779883af..fda5b7a73c 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -290,6 +290,7 @@ struct CommandList : _ze_command_list_handle_t { NEO::CommandStreamReceiver *csr = nullptr; Device *device = nullptr; NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial; + unsigned long numThreads = 1u; uint32_t cmdListType = CommandListType::TYPE_REGULAR; uint32_t commandListPerThreadScratchSize = 0u; uint32_t commandListPerThreadPrivateScratchSize = 0u; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index b831941833..80d7ff5a7b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -138,6 +138,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily barrierCalled{false}; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index d6f10e1cf1..c259f5586e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -206,6 +206,35 @@ ze_result_t CommandListCoreFamilyImmediate::executeCommandListImm return ZE_RESULT_SUCCESS; } +template +bool CommandListCoreFamilyImmediate::waitForEventsFromHost() { + bool waitForEventsFromHostEnabled = false; + if (NEO::DebugManager.flags.EventWaitOnHost.get() != -1) { + waitForEventsFromHostEnabled = NEO::DebugManager.flags.EventWaitOnHost.get(); + } + if (!waitForEventsFromHostEnabled) { + return false; + } + + auto numClients = static_cast(this->cmdQImmediate)->getCsr()->getNumClients(); + auto numClientsLimit = 2u; + if (NEO::DebugManager.flags.EventWaitOnHostNumClients.get() != -1) { + numClientsLimit = NEO::DebugManager.flags.EventWaitOnHostNumClients.get(); + } + if (numClients < numClientsLimit) { + return false; + }; + auto numThreadsLimit = 2u; + if (NEO::DebugManager.flags.EventWaitOnHostNumThreads.get() != -1) { + numThreadsLimit = NEO::DebugManager.flags.EventWaitOnHostNumThreads.get(); + } + if (this->numThreads < numThreadsLimit) { + return false; + } + + return true; +} + template ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( ze_kernel_handle_t kernelHandle, const ze_group_count_t *threadGroupDimensions, @@ -215,7 +244,14 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( if (this->isFlushTaskSubmissionEnabled) { checkAvailableSpace(); } - + if (waitForEventsFromHost()) { + for (uint32_t i = 0; i < numWaitEvents; i++) { + auto event = Event::fromHandle(phWaitEvents[i]); + event->hostSynchronize(std::numeric_limits::max()); + } + numWaitEvents = 0u; + phWaitEvents = nullptr; + } auto ret = CommandListCoreFamily::appendLaunchKernel(kernelHandle, threadGroupDimensions, hSignalEvent, numWaitEvents, phWaitEvents, launchParams); diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index 899ea9086b..3e26f6ee67 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -13,6 +13,7 @@ #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/logical_state_helper.h" #include "shared/source/indirect_heap/indirect_heap.h" +#include "shared/source/os_interface/sys_calls_common.h" #include "level_zero/core/source/cmdqueue/cmdqueue.h" #include "level_zero/core/source/device/device.h" @@ -158,6 +159,8 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device commandList->isBcsSplitNeeded = deviceImp->bcsSplit.setupDevice(productFamily, internalUsage, desc, csr); commandList->commandContainer.setImmediateCmdListCsr(csr); + commandList->numThreads = NEO::SysCalls::getNumThreads(); + return commandList; } diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index d1b256dfe1..9ff5c9212c 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -49,6 +49,7 @@ CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr } ze_result_t CommandQueueImp::destroy() { + this->csr->unregisterClient(); delete this; return ZE_RESULT_SUCCESS; } @@ -197,6 +198,7 @@ CommandQueue *CommandQueue::create(uint32_t productFamily, Device *device, NEO:: } osContext.ensureContextInitialized(); csr->initDirectSubmission(); + csr->registerClient(); return commandQueue; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index 867417eb45..d756dd960b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/os_interface/hw_info_config.h" +#include "shared/source/os_interface/sys_calls_common.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" @@ -521,6 +522,129 @@ HWTEST2_F(CmdlistAppendLaunchKernelTests, EXPECT_EQ(scratchPerThreadSize, ultCsr->requiredScratchSize); } +HWTEST2_F(CmdlistAppendLaunchKernelTests, + givenEventWaitOnHostWhenAppendLaunchKernelWithEventWaitListThenHostSynchronize, IsAtLeastXeHpCore) { + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EventWaitOnHost.set(1); + NEO::DebugManager.flags.EventWaitOnHostNumClients.set(0); + NEO::DebugManager.flags.EventWaitOnHostNumThreads.set(0); + + constexpr uint32_t scratchPerThreadSize = 0x200; + constexpr uint32_t privateScratchPerThreadSize = 0x100; + + std::unique_ptr mockKernelImmData = std::make_unique(0u); + auto kernelDescriptor = mockKernelImmData->kernelDescriptor; + kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false; + kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = scratchPerThreadSize; + kernelDescriptor->kernelAttributes.perThreadScratchSize[1] = privateScratchPerThreadSize; + createModuleFromMockBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + kernel->setGroupSize(4, 5, 6); + kernel->setGroupCount(3, 2, 1); + kernel->setGlobalOffsetExp(1, 2, 3); + kernel->patchGlobalOffset(); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + eventPoolDesc.count = 2; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.wait = 0; + eventDesc.signal = 0; + + struct MockEvent : public EventImp { + using EventImp::hostEventSetValueTimestamps; + using EventImp::isCompleted; + }; + ze_result_t returnValue; + std::unique_ptr eventPool = std::unique_ptr(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + std::unique_ptr event = std::unique_ptr(Event::create(eventPool.get(), &eventDesc, device)); + + std::array timestampData; + timestampData.fill(std::numeric_limits::max()); + static_cast(event.get())->hostEventSetValueTimestamps(0u); + + ze_result_t result = ZE_RESULT_SUCCESS; + ze_command_list_handle_t cmdListHandle; + ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC}; + queueDesc.ordinal = 0; + queueDesc.index = 0; + device->createCommandListImmediate(&queueDesc, &cmdListHandle); + + ze_group_count_t groupCount = {3, 2, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_event_handle_t eventHandles[1] = {event->toHandle()}; + EXPECT_FALSE(static_cast(event.get())->isCompleted); + + result = CommandList::fromHandle(cmdListHandle)->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 1, eventHandles, launchParams); + + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_TRUE(static_cast(event.get())->isCompleted); + + CommandList::fromHandle(cmdListHandle)->destroy(); +} + +HWTEST2_F(CmdlistAppendLaunchKernelTests, + givenEventWaitOnHostNumThreadsHigherThanNumThreadsWhenWaitForEventsFromHostThenReturnFalse, IsAtLeastXeHpCore) { + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EventWaitOnHost.set(1); + NEO::DebugManager.flags.EventWaitOnHostNumClients.set(0); + NEO::DebugManager.flags.EventWaitOnHostNumThreads.set(2); + EXPECT_EQ(NEO::SysCalls::getNumThreads(), 1u); + + ze_command_list_handle_t cmdListHandle; + ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC}; + queueDesc.ordinal = 0; + queueDesc.index = 0; + device->createCommandListImmediate(&queueDesc, &cmdListHandle); + + EXPECT_FALSE(static_cast *>(CommandList::fromHandle(cmdListHandle))->waitForEventsFromHost()); + + CommandList::fromHandle(cmdListHandle)->destroy(); +} + +HWTEST2_F(CmdlistAppendLaunchKernelTests, + givenEventWaitOnHostNumThreadsNotSetWhenWaitForEventsFromHostThenReturnFalse, IsAtLeastXeHpCore) { + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EventWaitOnHost.set(1); + NEO::DebugManager.flags.EventWaitOnHostNumClients.set(0); + EXPECT_EQ(NEO::SysCalls::getNumThreads(), 1u); + + ze_command_list_handle_t cmdListHandle; + ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC}; + queueDesc.ordinal = 0; + queueDesc.index = 0; + device->createCommandListImmediate(&queueDesc, &cmdListHandle); + + EXPECT_FALSE(static_cast *>(CommandList::fromHandle(cmdListHandle))->waitForEventsFromHost()); + + CommandList::fromHandle(cmdListHandle)->destroy(); +} + +HWTEST2_F(CmdlistAppendLaunchKernelTests, + givenEventWaitOnHostNumClientsNotSetWhenWaitForEventsFromHostThenReturnFalse, IsAtLeastXeHpCore) { + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EventWaitOnHost.set(1); + EXPECT_EQ(NEO::SysCalls::getNumThreads(), 1u); + + ze_command_list_handle_t cmdListHandle; + ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC}; + queueDesc.ordinal = 0; + queueDesc.index = 0; + device->createCommandListImmediate(&queueDesc, &cmdListHandle); + EXPECT_EQ(static_cast(CommandList::fromHandle(cmdListHandle)->cmdQImmediate)->getCsr()->getNumClients(), 1u); + + EXPECT_FALSE(static_cast *>(CommandList::fromHandle(cmdListHandle))->waitForEventsFromHost()); + + CommandList::fromHandle(cmdListHandle)->destroy(); +} + HWTEST2_F(CmdlistAppendLaunchKernelTests, givenImmediateCommandListUsesFlushTaskWhenDispatchingKernelWithSpillAndPrivateScratchSpaceThenExpectCsrHasCorrectValuesSet, IsAtLeastXeHpCore) { constexpr uint32_t scratchPerThreadSize = 0x200; diff --git a/opencl/test/unit_test/linux/main_linux_dll.cpp b/opencl/test/unit_test/linux/main_linux_dll.cpp index 993781bca8..6770723f93 100644 --- a/opencl/test/unit_test/linux/main_linux_dll.cpp +++ b/opencl/test/unit_test/linux/main_linux_dll.cpp @@ -806,6 +806,11 @@ TEST(SysCalls, WhenSysCallsFstatCalledThenCallIsRedirectedToOs) { EXPECT_EQ(0, result); } +TEST(SysCalls, WhenSysCallsGetNumThreadsCalledThenCallIsRedirectedToOs) { + auto result = NEO::SysCalls::getNumThreads(); + EXPECT_GT(result, 0u); +} + int main(int argc, char **argv) { bool useDefaultListener = false; bool enableAlarm = true; diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index bb34fce71e..8e53af9967 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -362,6 +362,16 @@ class CommandStreamReceiver { virtual void initializeDeviceWithFirstSubmission() = 0; + uint32_t getNumClients() { + return this->numClients.load(); + } + void registerClient() { + this->numClients++; + } + void unregisterClient() { + this->numClients--; + } + protected: void cleanupResources(); void printDeviceIndex(); @@ -428,6 +438,8 @@ class CommandStreamReceiver { // taskCount - # of tasks submitted std::atomic taskCount{0}; + std::atomic numClients = 0u; + DispatchMode dispatchMode = DispatchMode::ImmediateDispatch; SamplerCacheFlushState samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired; PreemptionMode lastPreemptionMode = PreemptionMode::Initial; diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index d420b54b41..e9b3d34726 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -282,6 +282,9 @@ DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then max DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushEveryEnqueueCount, -1, "If greater than 0, driver performs implicit flush every N submissions.") DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: platform specific, 0: force disable, 1: force enable") DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable") +DECLARE_DEBUG_VARIABLE(int32_t, EventWaitOnHost, -1, "Wait for events on host instead of program semaphores for them, works for append kernel launch with immediate command list, -1: default, 0: disable, 1: enable") +DECLARE_DEBUG_VARIABLE(int32_t, EventWaitOnHostNumClients, -1, "Number of command queues created within csr from which event wait on host will be applied, -1: default=2, >=0: client count") +DECLARE_DEBUG_VARIABLE(int32_t, EventWaitOnHostNumThreads, -1, "Thread count from which event wait on host will be applied, -1: default=2, >=0: thread count") DECLARE_DEBUG_VARIABLE(int32_t, EnableCacheFlushAfterWalkerForAllQueues, -1, "Enable cache flush after walker even if queue doesn't require it") DECLARE_DEBUG_VARIABLE(int32_t, OverrideKernelSizeLimitForSmallDispatch, -1, "-1: default, >=0: on XEHP+ changes the threshold for treating kernel as small during NULL LWS selection") DECLARE_DEBUG_VARIABLE(int32_t, OverrideUseKmdWaitFunction, -1, "-1: default (L0: disabled), 0: disabled, 1: enabled. It uses only busy loop to wait or busy loop with KMD wait function, when KMD fallback is enabled") diff --git a/shared/source/os_interface/linux/sys_calls_linux.cpp b/shared/source/os_interface/linux/sys_calls_linux.cpp index b0c382a3c9..92e3e1568d 100644 --- a/shared/source/os_interface/linux/sys_calls_linux.cpp +++ b/shared/source/os_interface/linux/sys_calls_linux.cpp @@ -25,6 +25,12 @@ unsigned int getProcessId() { return getpid(); } +unsigned long getNumThreads() { + struct stat taskStat; + stat("/proc/self/task", &taskStat); + return taskStat.st_nlink - 2; +} + int close(int fileDescriptor) { return ::close(fileDescriptor); } diff --git a/shared/source/os_interface/sys_calls_common.h b/shared/source/os_interface/sys_calls_common.h index 413dae2fc4..ccbfde1863 100644 --- a/shared/source/os_interface/sys_calls_common.h +++ b/shared/source/os_interface/sys_calls_common.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,6 +13,8 @@ namespace SysCalls { unsigned int getProcessId(); +unsigned long getNumThreads(); + } // namespace SysCalls } // namespace NEO diff --git a/shared/source/os_interface/windows/sys_calls.cpp b/shared/source/os_interface/windows/sys_calls.cpp index 7db9096aa5..e4fd011215 100644 --- a/shared/source/os_interface/windows/sys_calls.cpp +++ b/shared/source/os_interface/windows/sys_calls.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -30,6 +30,10 @@ unsigned int getProcessId() { return GetCurrentProcessId(); } +unsigned long getNumThreads() { + return 1; +} + HANDLE createEvent(LPSECURITY_ATTRIBUTES lpEventAttributes, BOOL bManualReset, BOOL bInitialState, LPCSTR lpName) { return CreateEventA(lpEventAttributes, bManualReset, bInitialState, lpName); } diff --git a/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp b/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp index ba3d9e7c42..636016ef93 100644 --- a/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp +++ b/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp @@ -106,6 +106,10 @@ unsigned int getProcessId() { return 0xABCEDF; } +unsigned long getNumThreads() { + return 1; +} + int access(const char *pathName, int mode) { if (allowFakeDevicePath || strcmp(pathName, "/sys/dev/char/226:128") == 0) { return 0; diff --git a/shared/test/common/os_interface/windows/sys_calls.cpp b/shared/test/common/os_interface/windows/sys_calls.cpp index 1e8e960958..43009a8920 100644 --- a/shared/test/common/os_interface/windows/sys_calls.cpp +++ b/shared/test/common/os_interface/windows/sys_calls.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -19,6 +19,10 @@ unsigned int getProcessId() { return 0xABCEDF; } +unsigned long getNumThreads() { + return 1; +} + BOOL systemPowerStatusRetVal = 1; BYTE systemPowerStatusACLineStatusOverride = 1; const wchar_t *currentLibraryPath = L""; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 695005c88f..a1089ed9d0 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -324,6 +324,9 @@ EnableResourceTags = 0 SetKmdWaitTimeout = -1 OverrideNotifyEnableForTagUpdatePostSync = -1 OverrideUseKmdWaitFunction = -1 +EventWaitOnHost = -1 +EventWaitOnHostNumClients = -1 +EventWaitOnHostNumThreads = -1 EnableCacheFlushAfterWalkerForAllQueues = -1 Force32BitDriverSupport = -1 EnableCopyEngineSelector = -1 diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index ca86f3d884..0c44ef90be 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -87,6 +87,23 @@ HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenDefaultValuesAreSet) { EXPECT_FALSE(csr.isPreambleSent); } +HWTEST_F(CommandStreamReceiverTest, whenRegisterClientThenIncrementClientNum) { + auto &csr = pDevice->getUltCommandStreamReceiver(); + auto numClients = csr.getNumClients(); + + csr.registerClient(); + EXPECT_EQ(csr.getNumClients(), numClients + 1); + + csr.registerClient(); + EXPECT_EQ(csr.getNumClients(), numClients + 2); + + csr.unregisterClient(); + EXPECT_EQ(csr.getNumClients(), numClients + 1); + + csr.unregisterClient(); + EXPECT_EQ(csr.getNumClients(), numClients); +} + HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenTimestampTypeIs32b) { using ExpectedType = TimestampPackets;