diff --git a/level_zero/api/core/ze_barrier_api_entrypoints.h b/level_zero/api/core/ze_barrier_api_entrypoints.h index 446d3c25af..5e3037bd06 100644 --- a/level_zero/api/core/ze_barrier_api_entrypoints.h +++ b/level_zero/api/core/ze_barrier_api_entrypoints.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -36,6 +36,12 @@ ze_result_t zeDeviceSystemBarrier( return L0::Device::fromHandle(hDevice)->systemBarrier(); } +ze_result_t ZE_APICALL zeCommandListHostSynchronize( + ze_command_list_handle_t hCommandList, + uint64_t timeout) { + return L0::CommandList::fromHandle(hCommandList)->hostSynchronize(timeout); +} + } // namespace L0 extern "C" { @@ -74,4 +80,12 @@ ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceSystemBarrier( return L0::zeDeviceSystemBarrier( hDevice); } + +ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListHostSynchronize( + ze_command_list_handle_t hCommandList, + uint64_t timeout) { + return L0::zeCommandListHostSynchronize( + hCommandList, + timeout); +} } diff --git a/level_zero/api/core/ze_core_loader.cpp b/level_zero/api/core/ze_core_loader.cpp index 842d05f989..b4b349a666 100644 --- a/level_zero/api/core/ze_core_loader.cpp +++ b/level_zero/api/core/ze_core_loader.cpp @@ -339,6 +339,7 @@ zeGetCommandListProcAddrTable( pDdiTable->pfnAppendWriteGlobalTimestamp = L0::zeCommandListAppendWriteGlobalTimestamp; pDdiTable->pfnAppendMemoryCopyFromContext = L0::zeCommandListAppendMemoryCopyFromContext; pDdiTable->pfnAppendQueryKernelTimestamps = L0::zeCommandListAppendQueryKernelTimestamps; + pDdiTable->pfnHostSynchronize = L0::zeCommandListHostSynchronize; driverDdiTable.coreDdiTable.CommandList = *pDdiTable; if (driverDdiTable.enableTracing) { pDdiTable->pfnAppendBarrier = zeCommandListAppendBarrierTracing; diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 0059082590..af942c02eb 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -175,6 +175,7 @@ struct CommandList : _ze_command_list_handle_t { uint32_t data, ze_event_handle_t signalEventHandle) = 0; virtual ze_result_t appendWriteToMemory(void *desc, void *ptr, uint64_t data) = 0; + virtual ze_result_t hostSynchronize(uint64_t timeout) = 0; static CommandList *create(uint32_t productFamily, Device *device, NEO::EngineGroupType engineGroupType, ze_command_list_flags_t flags, ze_result_t &resultValue); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 823d87a4aa..5b867c0117 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -165,6 +165,7 @@ struct CommandListCoreFamily : CommandListImp { ze_result_t appendQueryKernelTimestamps(uint32_t numEvents, ze_event_handle_t *phEvents, void *dstptr, const size_t *pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; + ze_result_t hostSynchronize(uint64_t timeout) override; ze_result_t appendSignalEvent(ze_event_handle_t hEvent) override; ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) override; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index da451b481b..4b894413e2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2472,6 +2472,11 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( return ZE_RESULT_SUCCESS; } +template +ze_result_t CommandListCoreFamily::hostSynchronize(uint64_t timeout) { + return ZE_RESULT_ERROR_INVALID_ARGUMENT; +} + template ze_result_t CommandListCoreFamily::reserveSpace(size_t size, void **ptr) { auto availableSpace = commandContainer.getCommandStream()->getAvailableSpace(); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 66c0f72b63..baf3d45d80 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -143,6 +143,8 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::appendLaunchCooperati return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent); } +template +ze_result_t CommandListCoreFamilyImmediate::hostSynchronize(uint64_t timeout) { + + if (this->isFlushTaskSubmissionEnabled && !this->isSyncModeQueue) { + const int64_t timeoutInMicroSeconds = timeout / 1000; + auto syncTaskCount = this->csr->peekTaskCount(); + const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutInMicroSeconds}, + syncTaskCount); + if (waitStatus == NEO::WaitStatus::GpuHang) { + this->printKernelsPrintfOutput(true); + this->checkAssert(); + return ZE_RESULT_ERROR_DEVICE_LOST; + } + this->csr->getInternalAllocationStorage()->cleanAllocationList(syncTaskCount, NEO::AllocationUsage::TEMPORARY_ALLOCATION); + this->printKernelsPrintfOutput(false); + this->checkAssert(); + } + return ZE_RESULT_SUCCESS; +} + template ze_result_t CommandListCoreFamilyImmediate::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, ze_event_handle_t hSignalEvent) { diff --git a/level_zero/core/test/black_box_tests/zello_immediate.cpp b/level_zero/core/test/black_box_tests/zello_immediate.cpp index ee55f580b0..60c7f9a14e 100644 --- a/level_zero/core/test/black_box_tests/zello_immediate.cpp +++ b/level_zero/core/test/black_box_tests/zello_immediate.cpp @@ -9,6 +9,7 @@ #include "zello_common.h" +#include #include #include #include @@ -31,12 +32,13 @@ void createImmediateCommandList(ze_device_handle_t &device, SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdList)); } -void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, int32_t copyQueueGroup, bool &validRet) { +void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, int32_t copyQueueGroup, bool &validRet, bool useEventBasedSync) { const size_t allocSize = 4096 + 7; // +7 to brake alignment and make it harder char *hostBuffer = nullptr; void *deviceBuffer = nullptr; char *stackBuffer = new char[allocSize]; ze_command_list_handle_t cmdList; + const bool isEventsUsed = useEventBasedSync && !syncMode; createImmediateCommandList(device, context, copyQueueGroup, syncMode, cmdList); @@ -80,13 +82,17 @@ void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_ // Copy from device-allocated memory to stack SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, stackBuffer, deviceBuffer, allocSize, - syncMode ? nullptr : hostEvents[0], + isEventsUsed ? hostEvents[0] : nullptr, syncMode ? 0 : 1, syncMode ? nullptr : &deviceEvents[0])); if (!syncMode) { - // If Async mode, use event for sync - SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits::max())); + if (isEventsUsed) { + // If Async mode, use event for sync + SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits::max())); + } else { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize(cmdList, std::numeric_limits::max())); + } } // Validate stack and xe deviceBuffers have the original data from hostBuffer @@ -108,11 +114,12 @@ void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_ SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdList)); } -void executeGpuKernelAndValidate(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, bool &outputValidationSuccessful) { +void executeGpuKernelAndValidate(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, bool &outputValidationSuccessful, bool useEventBasedSync) { ze_command_list_handle_t cmdList; uint32_t computeOrdinal = getCommandQueueOrdinal(device); createImmediateCommandList(device, context, computeOrdinal, syncMode, cmdList); + const auto isEventsUsed = useEventBasedSync && !syncMode; // Create two shared buffers constexpr size_t allocSize = 4096; @@ -202,17 +209,25 @@ void executeGpuKernelAndValidate(ze_context_handle_t &context, ze_device_handle_ dispatchTraits.groupCountZ = 1u; SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel(cmdList, kernel, &dispatchTraits, - syncMode ? nullptr : hostEvents[0], 0, nullptr)); + isEventsUsed ? hostEvents[0] : nullptr, 0, nullptr)); file.close(); } else { // Perform a GPU copy SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, dstBuffer, srcBuffer, allocSize, - syncMode ? nullptr : hostEvents[0], 0, nullptr)); + isEventsUsed ? hostEvents[0] : nullptr, 0, nullptr)); } if (!syncMode) { - // If Async mode, use event for sync - SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits::max())); + std::chrono::high_resolution_clock::time_point start, end; + start = std::chrono::high_resolution_clock::now(); + if (isEventsUsed) { + // If Async mode, use event for sync + SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits::max())); + } else { + SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize(cmdList, std::numeric_limits::max())); + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Time to synchronize : " << std::chrono::duration_cast(end - start).count(); } // Validate @@ -245,6 +260,7 @@ int main(int argc, char *argv[]) { verbose = isVerbose(argc, argv); bool aubMode = isAubMode(argc, argv); + int useEventBasedSync = getParamValue(argc, argv, "-e", "--useEventsBasedSync", 1); ze_context_handle_t context = nullptr; ze_driver_handle_t driverHandle = nullptr; @@ -259,12 +275,12 @@ int main(int argc, char *argv[]) { if (outputValidationSuccessful || aubMode) { // Sync mode with Compute queue std::cout << "Test case: Sync mode compute queue with Kernel launch \n"; - executeGpuKernelAndValidate(context, device, true, outputValidationSuccessful); + executeGpuKernelAndValidate(context, device, true, outputValidationSuccessful, useEventBasedSync); } if (outputValidationSuccessful || aubMode) { // Async mode with Compute queue std::cout << "\nTest case: Async mode compute queue with Kernel launch \n"; - executeGpuKernelAndValidate(context, device, false, outputValidationSuccessful); + executeGpuKernelAndValidate(context, device, false, outputValidationSuccessful, useEventBasedSync); } // Find copy queue in root device, if not found, try subdevices @@ -321,12 +337,12 @@ int main(int argc, char *argv[]) { if (outputValidationSuccessful || aubMode) { // Sync mode with Copy queue std::cout << "\nTest case: Sync mode copy queue for memory copy\n"; - testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, true, copyQueueGroup, outputValidationSuccessful); + testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, true, copyQueueGroup, outputValidationSuccessful, useEventBasedSync); } if (outputValidationSuccessful || aubMode) { // Async mode with Copy queue std::cout << "\nTest case: Async mode copy queue for memory copy\n"; - testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, false, copyQueueGroup, outputValidationSuccessful); + testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, false, copyQueueGroup, outputValidationSuccessful, useEventBasedSync); } } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 39dc70d196..cdcff7afe6 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -478,6 +478,8 @@ struct MockCommandList : public CommandList { ADDMETHOD_NOBASE_VOIDRETURN(appendMultiPartitionPrologue, (uint32_t partitionDataSize)); ADDMETHOD_NOBASE_VOIDRETURN(appendMultiPartitionEpilogue, (void)); + ADDMETHOD_NOBASE(hostSynchronize, ze_result_t, ZE_RESULT_SUCCESS, + (uint64_t timeout)); uint8_t *batchBuffer = nullptr; NEO::GraphicsAllocation *mockAllocation = nullptr; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index f24031ebb2..1966ec9f5c 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -2949,5 +2949,80 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenAlignePtrToFillWhenAppendMemoryFil context->freeMem(dstBuffer); } +using ImmediateCommandListHostSynchronize = Test; + +HWTEST2_F(ImmediateCommandListHostSynchronize, givenFlushTaskEnabledAndNotSyncModeThenWaitForCompletionIsCalled, IsAtLeastSkl) { + MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; + cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; + cmdList.isFlushTaskSubmissionEnabled = true; + cmdList.isSyncModeQueue = false; + + EXPECT_EQ(cmdList.hostSynchronize(0), ZE_RESULT_SUCCESS); + + uint32_t waitForFlushTagUpdateCalled = reinterpret_cast *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled; + EXPECT_EQ(waitForFlushTagUpdateCalled, 1u); +} + +HWTEST2_F(ImmediateCommandListHostSynchronize, givenSyncModeThenWaitForCompletionIsNotCalled, IsAtLeastSkl) { + MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; + cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; + cmdList.isFlushTaskSubmissionEnabled = true; + cmdList.isSyncModeQueue = true; + + EXPECT_EQ(cmdList.hostSynchronize(0), ZE_RESULT_SUCCESS); + + uint32_t waitForFlushTagUpdateCalled = reinterpret_cast *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled; + EXPECT_EQ(waitForFlushTagUpdateCalled, 0u); +} + +HWTEST2_F(ImmediateCommandListHostSynchronize, givenFlushTaskSubmissionIsDisabledThenWaitForCompletionIsNotCalled, IsAtLeastSkl) { + MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; + cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; + cmdList.isFlushTaskSubmissionEnabled = false; + cmdList.isSyncModeQueue = false; + + EXPECT_EQ(cmdList.hostSynchronize(0), ZE_RESULT_SUCCESS); + + uint32_t waitForFlushTagUpdateCalled = reinterpret_cast *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled; + EXPECT_EQ(waitForFlushTagUpdateCalled, 0u); +} + +HWTEST2_F(ImmediateCommandListHostSynchronize, givenGpuStatusIsHangThenDeviceLostIsReturned, IsAtLeastSkl) { + MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; + cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); + cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; + cmdList.isFlushTaskSubmissionEnabled = true; + cmdList.isSyncModeQueue = false; + reinterpret_cast *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false; + reinterpret_cast *>(cmdList.csr)->returnWaitForCompletionWithTimeout = WaitStatus::GpuHang; + + EXPECT_EQ(cmdList.hostSynchronize(0), ZE_RESULT_ERROR_DEVICE_LOST); + + uint32_t waitForFlushTagUpdateCalled = reinterpret_cast *>(cmdList.csr)->waitForCompletionWithTimeoutTaskCountCalled; + EXPECT_EQ(waitForFlushTagUpdateCalled, 1u); +} + +using CommandListHostSynchronize = Test; + +HWTEST2_F(CommandListHostSynchronize, whenHostSychronizeIsCalledReturnInvalidArgument, IsAtLeastSkl) { + ze_command_list_desc_t desc = {}; + ze_command_list_handle_t hCommandList = {}; + + ze_result_t result = context->createCommandList(device, &desc, &hCommandList); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(Context::fromHandle(CommandList::fromHandle(hCommandList)->getCmdListContext()), context); + + L0::CommandList *commandList = L0::CommandList::fromHandle(hCommandList); + EXPECT_EQ(commandList->hostSynchronize(0), ZE_RESULT_ERROR_INVALID_ARGUMENT); + commandList->destroy(); +} + } // namespace ult } // namespace L0