Wait for events from host
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
parent
3323deb825
commit
b21fef96cb
|
@ -290,6 +290,7 @@ struct CommandList : _ze_command_list_handle_t {
|
|||
NEO::CommandStreamReceiver *csr = nullptr;
|
||||
Device *device = nullptr;
|
||||
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
|
||||
unsigned long numThreads = 1u;
|
||||
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
|
||||
uint32_t commandListPerThreadScratchSize = 0u;
|
||||
uint32_t commandListPerThreadPrivateScratchSize = 0u;
|
||||
|
|
|
@ -138,6 +138,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
|||
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc, bool allocFound);
|
||||
ze_result_t performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||
void *obtainLockedPtrFromDevice(void *ptr, size_t size);
|
||||
bool waitForEventsFromHost();
|
||||
|
||||
protected:
|
||||
std::atomic<bool> barrierCalled{false};
|
||||
|
|
|
@ -206,6 +206,35 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
|
|||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::waitForEventsFromHost() {
|
||||
bool waitForEventsFromHostEnabled = false;
|
||||
if (NEO::DebugManager.flags.EventWaitOnHost.get() != -1) {
|
||||
waitForEventsFromHostEnabled = NEO::DebugManager.flags.EventWaitOnHost.get();
|
||||
}
|
||||
if (!waitForEventsFromHostEnabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto numClients = static_cast<CommandQueueImp *>(this->cmdQImmediate)->getCsr()->getNumClients();
|
||||
auto numClientsLimit = 2u;
|
||||
if (NEO::DebugManager.flags.EventWaitOnHostNumClients.get() != -1) {
|
||||
numClientsLimit = NEO::DebugManager.flags.EventWaitOnHostNumClients.get();
|
||||
}
|
||||
if (numClients < numClientsLimit) {
|
||||
return false;
|
||||
};
|
||||
auto numThreadsLimit = 2u;
|
||||
if (NEO::DebugManager.flags.EventWaitOnHostNumThreads.get() != -1) {
|
||||
numThreadsLimit = NEO::DebugManager.flags.EventWaitOnHostNumThreads.get();
|
||||
}
|
||||
if (this->numThreads < numThreadsLimit) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
|
||||
ze_kernel_handle_t kernelHandle, const ze_group_count_t *threadGroupDimensions,
|
||||
|
@ -215,7 +244,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
|
|||
if (this->isFlushTaskSubmissionEnabled) {
|
||||
checkAvailableSpace();
|
||||
}
|
||||
|
||||
if (waitForEventsFromHost()) {
|
||||
for (uint32_t i = 0; i < numWaitEvents; i++) {
|
||||
auto event = Event::fromHandle(phWaitEvents[i]);
|
||||
event->hostSynchronize(std::numeric_limits<uint64_t>::max());
|
||||
}
|
||||
numWaitEvents = 0u;
|
||||
phWaitEvents = nullptr;
|
||||
}
|
||||
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(kernelHandle, threadGroupDimensions,
|
||||
hSignalEvent, numWaitEvents, phWaitEvents,
|
||||
launchParams);
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "shared/source/helpers/engine_node_helper.h"
|
||||
#include "shared/source/helpers/logical_state_helper.h"
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
#include "shared/source/os_interface/sys_calls_common.h"
|
||||
|
||||
#include "level_zero/core/source/cmdqueue/cmdqueue.h"
|
||||
#include "level_zero/core/source/device/device.h"
|
||||
|
@ -158,6 +159,8 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
|
|||
commandList->isBcsSplitNeeded = deviceImp->bcsSplit.setupDevice(productFamily, internalUsage, desc, csr);
|
||||
commandList->commandContainer.setImmediateCmdListCsr(csr);
|
||||
|
||||
commandList->numThreads = NEO::SysCalls::getNumThreads();
|
||||
|
||||
return commandList;
|
||||
}
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr
|
|||
}
|
||||
|
||||
// Destroys this queue: releases its client registration on the CSR, frees the
// object and reports success.
ze_result_t CommandQueueImp::destroy() {
    // Must run before `delete this`, because csr is a member of the object being freed.
    csr->unregisterClient();

    delete this;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
@ -197,6 +198,7 @@ CommandQueue *CommandQueue::create(uint32_t productFamily, Device *device, NEO::
|
|||
}
|
||||
osContext.ensureContextInitialized();
|
||||
csr->initDirectSubmission();
|
||||
csr->registerClient();
|
||||
return commandQueue;
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
#include "shared/source/os_interface/hw_info_config.h"
|
||||
#include "shared/source/os_interface/sys_calls_common.h"
|
||||
#include "shared/test/common/cmd_parse/hw_parse.h"
|
||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||
|
@ -521,6 +522,129 @@ HWTEST2_F(CmdlistAppendLaunchKernelTests,
|
|||
EXPECT_EQ(scratchPerThreadSize, ultCsr->requiredScratchSize);
|
||||
}
|
||||
|
||||
// Verifies that, with host-side event waiting forced on, appending a kernel to an
// immediate command list with a wait list leaves the waited event marked completed.
HWTEST2_F(CmdlistAppendLaunchKernelTests,
          givenEventWaitOnHostWhenAppendLaunchKernelWithEventWaitListThenHostSynchronize, IsAtLeastXeHpCore) {
    DebugManagerStateRestore restorer;
    NEO::DebugManager.flags.EventWaitOnHost.set(1);
    // Zero thresholds make the client-count and thread-count checks always pass.
    NEO::DebugManager.flags.EventWaitOnHostNumClients.set(0);
    NEO::DebugManager.flags.EventWaitOnHostNumThreads.set(0);

    constexpr uint32_t scratchPerThreadSize = 0x200;
    constexpr uint32_t privateScratchPerThreadSize = 0x100;

    std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
    auto kernelDescriptor = mockKernelImmData->kernelDescriptor;
    kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false;
    kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = scratchPerThreadSize;
    kernelDescriptor->kernelAttributes.perThreadScratchSize[1] = privateScratchPerThreadSize;
    createModuleFromMockBinary(0u, false, mockKernelImmData.get());

    auto kernel = std::make_unique<MockKernel>(module.get());

    ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
    kernel->initialize(&kernelDesc);

    kernel->setGroupSize(4, 5, 6);
    kernel->setGroupCount(3, 2, 1);
    kernel->setGlobalOffsetExp(1, 2, 3);
    kernel->patchGlobalOffset();

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    eventPoolDesc.count = 2;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    eventDesc.wait = 0;
    eventDesc.signal = 0;

    // Exposes the protected members this test needs to drive and inspect.
    struct MockEvent : public EventImp<uint32_t> {
        using EventImp<uint32_t>::hostEventSetValueTimestamps;
        using EventImp<uint32_t>::isCompleted;
    };

    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    std::unique_ptr<EventPool> eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    // Stop here on a failed setup instead of dereferencing a missing pool below.
    ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
    std::unique_ptr<Event> event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event.get());

    auto mockEvent = static_cast<MockEvent *>(event.get());
    mockEvent->hostEventSetValueTimestamps(0u);

    ze_command_list_handle_t cmdListHandle = nullptr;
    ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0;
    queueDesc.index = 0;
    // The handle is only valid when creation succeeds.
    ASSERT_EQ(ZE_RESULT_SUCCESS, device->createCommandListImmediate(&queueDesc, &cmdListHandle));

    ze_group_count_t groupCount = {3, 2, 1};
    CmdListKernelLaunchParams launchParams = {};
    ze_event_handle_t eventHandles[1] = {event->toHandle()};
    EXPECT_FALSE(mockEvent->isCompleted);

    ze_result_t result = CommandList::fromHandle(cmdListHandle)->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 1, eventHandles, launchParams);

    EXPECT_EQ(result, ZE_RESULT_SUCCESS);
    EXPECT_TRUE(mockEvent->isCompleted);

    CommandList::fromHandle(cmdListHandle)->destroy();
}
|
||||
|
||||
// With the thread threshold forced to 2 and getNumThreads() reporting 1, the
// host-wait decision must be negative even though the feature flag is on.
HWTEST2_F(CmdlistAppendLaunchKernelTests,
          givenEventWaitOnHostNumThreadsHigherThanNumThreadsWhenWaitForEventsFromHostThenReturnFalse, IsAtLeastXeHpCore) {
    DebugManagerStateRestore restorer;
    NEO::DebugManager.flags.EventWaitOnHost.set(1);
    NEO::DebugManager.flags.EventWaitOnHostNumClients.set(0); // client check always passes
    NEO::DebugManager.flags.EventWaitOnHostNumThreads.set(2);
    EXPECT_EQ(NEO::SysCalls::getNumThreads(), 1u);

    ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0;
    queueDesc.index = 0;
    ze_command_list_handle_t cmdListHandle;
    device->createCommandListImmediate(&queueDesc, &cmdListHandle);

    auto commandList = CommandList::fromHandle(cmdListHandle);
    auto immediateCmdList = static_cast<L0::CommandListCoreFamilyImmediate<gfxCoreFamily> *>(commandList);
    EXPECT_FALSE(immediateCmdList->waitForEventsFromHost());

    commandList->destroy();
}
|
||||
|
||||
// Leaves EventWaitOnHostNumThreads untouched so the built-in threshold applies;
// with getNumThreads() reporting 1 the host-wait decision must be negative.
HWTEST2_F(CmdlistAppendLaunchKernelTests,
          givenEventWaitOnHostNumThreadsNotSetWhenWaitForEventsFromHostThenReturnFalse, IsAtLeastXeHpCore) {
    DebugManagerStateRestore restorer;
    NEO::DebugManager.flags.EventWaitOnHost.set(1);
    NEO::DebugManager.flags.EventWaitOnHostNumClients.set(0); // client check always passes
    EXPECT_EQ(NEO::SysCalls::getNumThreads(), 1u);

    ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0;
    queueDesc.index = 0;
    ze_command_list_handle_t cmdListHandle;
    device->createCommandListImmediate(&queueDesc, &cmdListHandle);

    auto commandList = CommandList::fromHandle(cmdListHandle);
    auto immediateCmdList = static_cast<L0::CommandListCoreFamilyImmediate<gfxCoreFamily> *>(commandList);
    EXPECT_FALSE(immediateCmdList->waitForEventsFromHost());

    commandList->destroy();
}
|
||||
|
||||
// Leaves EventWaitOnHostNumClients untouched so the built-in client threshold
// applies; with a single registered client the host-wait decision must be negative.
HWTEST2_F(CmdlistAppendLaunchKernelTests,
          givenEventWaitOnHostNumClientsNotSetWhenWaitForEventsFromHostThenReturnFalse, IsAtLeastXeHpCore) {
    DebugManagerStateRestore restorer;
    NEO::DebugManager.flags.EventWaitOnHost.set(1);
    EXPECT_EQ(NEO::SysCalls::getNumThreads(), 1u);

    ze_command_queue_desc_t queueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0;
    queueDesc.index = 0;
    ze_command_list_handle_t cmdListHandle;
    device->createCommandListImmediate(&queueDesc, &cmdListHandle);

    auto commandList = CommandList::fromHandle(cmdListHandle);
    // Creating the immediate list is expected to have registered exactly one client on its CSR.
    auto immediateQueue = static_cast<CommandQueueImp *>(commandList->cmdQImmediate);
    EXPECT_EQ(immediateQueue->getCsr()->getNumClients(), 1u);

    auto immediateCmdList = static_cast<L0::CommandListCoreFamilyImmediate<gfxCoreFamily> *>(commandList);
    EXPECT_FALSE(immediateCmdList->waitForEventsFromHost());

    commandList->destroy();
}
|
||||
|
||||
HWTEST2_F(CmdlistAppendLaunchKernelTests,
|
||||
givenImmediateCommandListUsesFlushTaskWhenDispatchingKernelWithSpillAndPrivateScratchSpaceThenExpectCsrHasCorrectValuesSet, IsAtLeastXeHpCore) {
|
||||
constexpr uint32_t scratchPerThreadSize = 0x200;
|
||||
|
|
|
@ -806,6 +806,11 @@ TEST(SysCalls, WhenSysCallsFstatCalledThenCallIsRedirectedToOs) {
|
|||
EXPECT_EQ(0, result);
|
||||
}
|
||||
|
||||
// The thread count reported by the sys call must be positive.
TEST(SysCalls, WhenSysCallsGetNumThreadsCalledThenCallIsRedirectedToOs) {
    EXPECT_GT(NEO::SysCalls::getNumThreads(), 0u);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
bool useDefaultListener = false;
|
||||
bool enableAlarm = true;
|
||||
|
|
|
@ -362,6 +362,16 @@ class CommandStreamReceiver {
|
|||
|
||||
virtual void initializeDeviceWithFirstSubmission() = 0;
|
||||
|
||||
uint32_t getNumClients() {
|
||||
return this->numClients.load();
|
||||
}
|
||||
void registerClient() {
|
||||
this->numClients++;
|
||||
}
|
||||
void unregisterClient() {
|
||||
this->numClients--;
|
||||
}
|
||||
|
||||
protected:
|
||||
void cleanupResources();
|
||||
void printDeviceIndex();
|
||||
|
@ -428,6 +438,8 @@ class CommandStreamReceiver {
|
|||
// taskCount - # of tasks submitted
|
||||
std::atomic<uint32_t> taskCount{0};
|
||||
|
||||
std::atomic<uint32_t> numClients = 0u;
|
||||
|
||||
DispatchMode dispatchMode = DispatchMode::ImmediateDispatch;
|
||||
SamplerCacheFlushState samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired;
|
||||
PreemptionMode lastPreemptionMode = PreemptionMode::Initial;
|
||||
|
|
|
@ -282,6 +282,9 @@ DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then max
|
|||
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushEveryEnqueueCount, -1, "If greater than 0, driver performs implicit flush every N submissions.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: platform specific, 0: force disable, 1: force enable")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EventWaitOnHost, -1, "Wait for events on host instead of program semaphores for them, works for append kernel launch with immediate command list, -1: default, 0: disable, 1: enable")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EventWaitOnHostNumClients, -1, "Number of command queues created within csr from which event wait on host will be applied, -1: default=2, >=0: client count")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EventWaitOnHostNumThreads, -1, "Thread count from which event wait on host will be applied, -1: default=2, >=0: thread count")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EnableCacheFlushAfterWalkerForAllQueues, -1, "Enable cache flush after walker even if queue doesn't require it")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideKernelSizeLimitForSmallDispatch, -1, "-1: default, >=0: on XEHP+ changes the threshold for treating kernel as small during NULL LWS selection")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideUseKmdWaitFunction, -1, "-1: default (L0: disabled), 0: disabled, 1: enabled. It uses only busy loop to wait or busy loop with KMD wait function, when KMD fallback is enabled")
|
||||
|
|
|
@ -25,6 +25,12 @@ unsigned int getProcessId() {
|
|||
return getpid();
|
||||
}
|
||||
|
||||
// Returns the number of threads in the calling process, derived from the link
// count of /proc/self/task: two links are discounted (presumably "." and ".."),
// and the remainder is taken as one entry per thread.
//
// Falls back to 1 when the directory cannot be stat'ed or reports a link count
// that would yield zero or wrap around, so callers never see an uninitialised
// or nonsensical value.
unsigned long getNumThreads() {
    struct stat taskStat = {};
    if (stat("/proc/self/task", &taskStat) != 0) {
        return 1ul;
    }

    constexpr unsigned long nonThreadLinks = 2ul;
    const auto linkCount = static_cast<unsigned long>(taskStat.st_nlink);
    if (linkCount <= nonThreadLinks) {
        // Avoid unsigned wrap-around; a running process has at least one thread.
        return 1ul;
    }
    return linkCount - nonThreadLinks;
}
|
||||
|
||||
int close(int fileDescriptor) {
|
||||
return ::close(fileDescriptor);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
|
@ -13,6 +13,8 @@ namespace SysCalls {
|
|||
|
||||
unsigned int getProcessId();
|
||||
|
||||
unsigned long getNumThreads();
|
||||
|
||||
} // namespace SysCalls
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
|
@ -30,6 +30,10 @@ unsigned int getProcessId() {
|
|||
return GetCurrentProcessId();
|
||||
}
|
||||
|
||||
// Thread enumeration is not performed in this implementation; it always
// reports a single thread.
unsigned long getNumThreads() {
    return 1ul;
}
|
||||
|
||||
HANDLE createEvent(LPSECURITY_ATTRIBUTES lpEventAttributes, BOOL bManualReset, BOOL bInitialState, LPCSTR lpName) {
|
||||
return CreateEventA(lpEventAttributes, bManualReset, bInitialState, lpName);
|
||||
}
|
||||
|
|
|
@ -106,6 +106,10 @@ unsigned int getProcessId() {
|
|||
return 0xABCEDF;
|
||||
}
|
||||
|
||||
// Fixed-value implementation (appears to be the unit-test replacement for the
// OS call): always reports one thread.
unsigned long getNumThreads() {
    return 1ul;
}
|
||||
|
||||
int access(const char *pathName, int mode) {
|
||||
if (allowFakeDevicePath || strcmp(pathName, "/sys/dev/char/226:128") == 0) {
|
||||
return 0;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
|
@ -19,6 +19,10 @@ unsigned int getProcessId() {
|
|||
return 0xABCEDF;
|
||||
}
|
||||
|
||||
// Fixed-value implementation (appears to be the unit-test replacement for the
// OS call): always reports one thread.
unsigned long getNumThreads() {
    return 1ul;
}
|
||||
|
||||
BOOL systemPowerStatusRetVal = 1;
|
||||
BYTE systemPowerStatusACLineStatusOverride = 1;
|
||||
const wchar_t *currentLibraryPath = L"";
|
||||
|
|
|
@ -324,6 +324,9 @@ EnableResourceTags = 0
|
|||
SetKmdWaitTimeout = -1
|
||||
OverrideNotifyEnableForTagUpdatePostSync = -1
|
||||
OverrideUseKmdWaitFunction = -1
|
||||
EventWaitOnHost = -1
|
||||
EventWaitOnHostNumClients = -1
|
||||
EventWaitOnHostNumThreads = -1
|
||||
EnableCacheFlushAfterWalkerForAllQueues = -1
|
||||
Force32BitDriverSupport = -1
|
||||
EnableCopyEngineSelector = -1
|
||||
|
|
|
@ -87,6 +87,23 @@ HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenDefaultValuesAreSet) {
|
|||
EXPECT_FALSE(csr.isPreambleSent);
|
||||
}
|
||||
|
||||
// Checks that registerClient()/unregisterClient() move the CSR client counter
// up and down by exactly one, measured relative to the count at test entry.
HWTEST_F(CommandStreamReceiverTest, whenRegisterClientThenIncrementClientNum) {
    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
    const auto baseline = commandStreamReceiver.getNumClients();

    // Two registrations in a row...
    commandStreamReceiver.registerClient();
    EXPECT_EQ(commandStreamReceiver.getNumClients(), baseline + 1);
    commandStreamReceiver.registerClient();
    EXPECT_EQ(commandStreamReceiver.getNumClients(), baseline + 2);

    // ...then unwind them back to the starting count.
    commandStreamReceiver.unregisterClient();
    EXPECT_EQ(commandStreamReceiver.getNumClients(), baseline + 1);
    commandStreamReceiver.unregisterClient();
    EXPECT_EQ(commandStreamReceiver.getNumClients(), baseline);
}
|
||||
|
||||
HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenTimestampTypeIs32b) {
|
||||
using ExpectedType = TimestampPackets<typename FamilyType::TimestampPacketType>;
|
||||
|
||||
|
|
Loading…
Reference in New Issue