diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index df908fd7ec..8356dc16e3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -203,7 +203,8 @@ struct CommandListCoreFamily : CommandListImp { const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent, bool isIndirect, - bool isPredicate); + bool isPredicate, + bool isCooperative); ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent); ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions); @@ -211,6 +212,7 @@ struct CommandListCoreFamily : CommandListImp { const void **pRanges); ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]); + ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions); void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb); void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker); void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 0e9ac7127d..8e10274ab8 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -24,6 +24,8 @@ #include "shared/source/memory_manager/allocation_properties.h" #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/memory_manager/memory_manager.h" +#include "shared/source/program/sync_buffer_handler.h" +#include "shared/source/program/sync_buffer_handler.inl" #include "level_zero/core/source/cmdlist/cmdlist_hw.h" #include "level_zero/core/source/device/device_imp.h" @@ -146,7 +148,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernel(ze_kernel_h } return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, - hEvent, false, false); + hEvent, false, false, false); } template @@ -156,7 +158,13 @@ ze_result_t CommandListCoreFamily::appendLaunchCooperativeKernel( uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents); + if (ret) { + return ret; + } + + return appendLaunchKernelWithParams(hKernel, pLaunchFuncArgs, + hSignalEvent, false, false, true); } template @@ -172,7 +180,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelIndirect(ze_ } appendEventForProfiling(hEvent, true); ret = appendLaunchKernelWithParams(hKernel, pDispatchArgumentsBuffer, - nullptr, true, false); + nullptr, true, false, false); appendSignalEventPostWalker(hEvent); return ret; @@ -203,7 +211,7 @@ ze_result_t CommandListCoreFamily::appendLaunchMultipleKernelsInd ret = appendLaunchKernelWithParams(phKernels[i], haveLaunchArguments ? &pLaunchArgumentsBuffer[i] : nullptr, - nullptr, true, true); + nullptr, true, true, false); if (ret) { return ret; } @@ -668,7 +676,7 @@ template ze_result_t CommandListCoreFamily::appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent) { - return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false); + return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false, false); } template @@ -850,7 +858,7 @@ ze_result_t CommandListCoreFamily::appendPageFaultCopy(NEO::Graph ze_group_count_t dispatchFuncArgs{groups, 1u, 1u}; ze_result_t ret = appendLaunchKernelWithParams(builtinFunction->toHandle(), &dispatchFuncArgs, - nullptr, false, false); + nullptr, false, false, false); if (ret != ZE_RESULT_SUCCESS) { return ret; } @@ -1539,6 +1547,30 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu return ZE_RESULT_SUCCESS; } +template +ze_result_t CommandListCoreFamily::programSyncBuffer(Kernel &kernel, NEO::Device &device, + const ze_group_count_t *pThreadGroupDimensions) { + auto &hwInfo = device.getHardwareInfo(); + auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + if (!hwHelper.isCooperativeDispatchSupported(this->engineGroupType, hwInfo.platform.eProductFamily)) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + uint32_t maximalNumberOfWorkgroupsAllowed; + auto ret = kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed); + UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS); + size_t requestedNumberOfWorkgroups = (pThreadGroupDimensions->groupCountX * pThreadGroupDimensions->groupCountY * + pThreadGroupDimensions->groupCountZ); + if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) { + return ZE_RESULT_ERROR_INVALID_ARGUMENT; + } + + device.allocateSyncBufferHandler(); + device.syncBufferHandler->prepareForEnqueue(requestedNumberOfWorkgroups, kernel); + + return ZE_RESULT_SUCCESS; +} + template void CommandListCoreFamily::appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb) { constexpr uint32_t mask = 0xfffffffe; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 003d1d5633..c6642303a0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -37,7 +37,8 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent, bool isIndirect, - bool isPredicate) { + bool isPredicate, + bool isCooperative) { const auto kernel = Kernel::fromHandle(hKernel); UNRECOVERABLE_IF(kernel == nullptr); appendEventForProfiling(hEvent, true); @@ -78,6 +79,15 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z this->indirectAllocationsAllowed = true; } + if (kernel->usesSyncBuffer()) { + auto retVal = (isCooperative + ? programSyncBuffer(*kernel, *device->getNEODevice(), pThreadGroupDimensions) + : ZE_RESULT_ERROR_INVALID_ARGUMENT); + if (retVal) { + return retVal; + } + } + KernelImp *kernelImp = static_cast(kernel); this->containsStatelessUncachedResource |= kernelImp->getKernelRequiresUncachedMocs(); uint32_t partitionCount = 0; diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h index 2b1a3b9967..e96c30fd19 100644 --- a/level_zero/core/source/kernel/kernel.h +++ b/level_zero/core/source/kernel/kernel.h @@ -132,6 +132,9 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI { virtual NEO::GraphicsAllocation *getPrintfBufferAllocation() = 0; virtual void printPrintfOutput() = 0; + virtual bool usesSyncBuffer() = 0; + virtual void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0; + Kernel() = default; Kernel(const Kernel &) = delete; Kernel(Kernel &&) = delete; diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 6b1589b4f1..841478cc27 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -774,6 +774,17 @@ void KernelImp::printPrintfOutput() { PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice()); } +bool KernelImp::usesSyncBuffer() { + return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer; +} + +void KernelImp::patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) { + this->residencyContainer.push_back(gfxAllocation); + NEO::patchPointer(ArrayRef(crossThreadData.get(), crossThreadDataSize), + this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress, + static_cast(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset))); +} + void KernelImp::setDebugSurface() { auto device = module->getDevice(); if (module->isDebugEnabled() && device->getNEODevice()->getDebugger()) { diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index a2d7bda37c..0a71e3ea8a 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -16,8 +16,6 @@ namespace L0 { -struct GraphicsAllocation; - struct KernelImp : Kernel { KernelImp(Module *module); @@ -82,6 +80,9 @@ struct KernelImp : Kernel { NEO::GraphicsAllocation *getPrintfBufferAllocation() override { return this->printfBuffer; } void printPrintfOutput() override; + bool usesSyncBuffer() override; + void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override; + const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); } uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 8b312a9a43..5bb7777a87 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -37,6 +37,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::applyMemoryRangesBarrier; using BaseClass::commandListPerThreadScratchSize; using BaseClass::commandListPreemptionMode; + using BaseClass::engineGroupType; using BaseClass::getAlignedAllocation; using BaseClass::getAllocationFromHostPtrMap; using BaseClass::getHostPtrAlloc; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp index 796e60c40c..3390838674 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp @@ -18,6 +18,8 @@ #include "level_zero/core/test/unit_tests/fixtures/module_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h" +#include "level_zero/core/test/unit_tests/mocks/mock_module.h" + namespace L0 { namespace ult { @@ -250,7 +252,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, WhenAppendingFunctionThenUsedCmdBufferS auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed(); - auto result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false); + auto result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed(); @@ -260,7 +262,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, WhenAppendingFunctionThenUsedCmdBufferS sizeBefore = commandList->commandContainer.getCommandStream()->getUsed(); - result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, true, false); + result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, true, false, false); ASSERT_EQ(ZE_RESULT_SUCCESS, result); sizeAfter = commandList->commandContainer.getCommandStream()->getUsed(); @@ -1009,5 +1011,59 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenAppendLaunchMult device->getDriverHandle()->freeMem(reinterpret_cast(numLaunchArgs)); } +HWTEST_F(CommandListAppendLaunchKernel, givenInvalidEventListWhenAppendLaunchCooperativeKernelIsCalledThenErrorIsReturned) { + createKernel(); + + ze_group_count_t groupCount{1, 1, 1}; + ze_result_t returnValue; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue)); + returnValue = commandList->appendLaunchCooperativeKernel(kernel->toHandle(), &groupCount, nullptr, 1, nullptr); + + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, returnValue); +} + +HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelIsCalledThenCorrectValueIsReturned, SklPlusMatcher) { + Mock<::L0::Kernel> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + kernel.setGroupSize(4, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + auto pCommandList = std::make_unique>>(); + auto result = pCommandList->initialize(device, NEO::EngineGroupType::Compute); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + auto &kernelAttributes = kernel.immutableData.kernelDescriptor->kernelAttributes; + kernelAttributes.flags.usesSyncBuffer = true; + kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber; + bool isCooperative = true; + result = pCommandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + { + VariableBackup usesSyncBuffer{&kernelAttributes.flags.packed}; + usesSyncBuffer = false; + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + } + { + VariableBackup groupCountX{&groupCount.groupCountX}; + uint32_t maximalNumberOfWorkgroupsAllowed; + kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed); + groupCountX = maximalNumberOfWorkgroupsAllowed + 1; + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative); + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result); + } + { + VariableBackup cooperative{&isCooperative}; + cooperative = false; + result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative); + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result); + } +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp index e4a55b8f1f..4cb01c6183 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp @@ -49,7 +49,8 @@ class AppendFillFixture : public DeviceFixture, public ::testing::Test { const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent, bool isIndirect, - bool isPredicate) override { + bool isPredicate, + bool isCooperative) override { if (numberOfCallsToAppendLaunchKernelWithParams == thresholdOfCallsToAppendLaunchKernelWithParamsToFail) { return ZE_RESULT_ERROR_UNKNOWN; } @@ -59,7 +60,8 @@ class AppendFillFixture : public DeviceFixture, public ::testing::Test { pThreadGroupDimensions, hEvent, isIndirect, - isPredicate); + isPredicate, + isCooperative); } uint32_t thresholdOfCallsToAppendLaunchKernelWithParamsToFail = std::numeric_limits::max(); diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index cfbf53ccbd..f0d7e0396b 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -5892,7 +5892,8 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue, auto rootDeviceIndex = device.getRootDeviceIndex(); auto &hardwareInfo = device.getHardwareInfo(); auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) { + auto engineGroupType = hwHelper.getEngineGroupType(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, hardwareInfo.platform.eProductFamily)) { retVal = CL_INVALID_COMMAND_QUEUE; return retVal; } @@ -5921,7 +5922,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue, return retVal; } - device.allocateSyncBufferHandler(); + device.getDevice().allocateSyncBufferHandler(); } if (!pCommandQueue->validateCapabilityForOperation(CL_QUEUE_CAPABILITY_KERNEL_INTEL, numEventsInWaitList, eventWaitList, event)) { diff --git a/opencl/source/cl_device/cl_device.cpp b/opencl/source/cl_device/cl_device.cpp index 857edd11be..76b356d126 100644 --- a/opencl/source/cl_device/cl_device.cpp +++ b/opencl/source/cl_device/cl_device.cpp @@ -15,7 +15,6 @@ #include "shared/source/helpers/string.h" #include "shared/source/os_interface/driver_info.h" #include "shared/source/os_interface/os_interface.h" -#include "shared/source/program/sync_buffer_handler.h" #include "shared/source/source_level_debugger/source_level_debugger.h" #include "opencl/source/helpers/cl_hw_helper.h" @@ -68,7 +67,6 @@ ClDevice::~ClDevice() { getSourceLevelDebugger()->notifyDeviceDestruction(); } - syncBufferHandler.reset(); for (auto &subDevice : subDevices) { subDevice.reset(); } @@ -98,14 +96,6 @@ bool ClDevice::isOcl21Conformant() const { hwInfo.capabilityTable.supportsPipes && hwInfo.capabilityTable.supportsIndependentForwardProgress); } -void ClDevice::allocateSyncBufferHandler() { - TakeOwnershipWrapper lock(*this); - if (syncBufferHandler.get() == nullptr) { - syncBufferHandler = std::make_unique(this->getDevice()); - UNRECOVERABLE_IF(syncBufferHandler.get() == nullptr); - } -} - void ClDevice::retainApi() { auto parentDeviceId = deviceInfo.parentDevice; if (parentDeviceId) { diff --git a/opencl/source/cl_device/cl_device.h b/opencl/source/cl_device/cl_device.h index 95f7430400..1d1f49ceb2 100644 --- a/opencl/source/cl_device/cl_device.h +++ b/opencl/source/cl_device/cl_device.h @@ -31,7 +31,6 @@ class MemoryManager; class PerformanceCounters; class Platform; class SourceLevelDebugger; -class SyncBufferHandler; struct DeviceInfo; struct EngineControl; struct HardwareCapabilities; @@ -77,7 +76,6 @@ class ClDevice : public BaseObject<_cl_device_id> { double getPlatformHostTimerResolution() const; bool isSimulation() const; GFXCORE_FAMILY getRenderCoreFamily() const; - void allocateSyncBufferHandler(); PerformanceCounters *getPerformanceCounters(); PreemptionMode getPreemptionMode() const; bool isDebuggerActive() const; @@ -119,7 +117,6 @@ class ClDevice : public BaseObject<_cl_device_id> { ClDevice *getDeviceById(uint32_t deviceId); const std::string &peekCompilerExtensions() const; const std::string &peekCompilerExtensionsWithFeatures() const; - std::unique_ptr syncBufferHandler; DeviceBitfield getDeviceBitfield() const; bool isDeviceEnqueueSupported() const; bool arePipesSupported() const; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 010882d18e..c2a0a7ddef 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -15,6 +15,7 @@ #include "shared/source/memory_manager/surface.h" #include "shared/source/os_interface/os_context.h" #include "shared/source/program/sync_buffer_handler.h" +#include "shared/source/program/sync_buffer_handler.inl" #include "shared/source/utilities/range.h" #include "shared/source/utilities/tag_allocator.h" @@ -404,7 +405,7 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize(); size_t workGroupsCount = (gws.x * gws.y * gws.z) / (lws.x * lws.y * lws.z); - device->syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel()); + device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel()); } if (commandType == CL_COMMAND_NDRANGE_KERNEL) { @@ -685,7 +686,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( auto rootDeviceIndex = device->getRootDeviceIndex(); if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(rootDeviceIndex)) { - device->syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver()); + device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver()); } if (timestampPacketContainer) { diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 19af26aab5..ad8b2ecc9e 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1117,7 +1117,8 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local auto &hardwareInfo = getHardwareInfo(rootDeviceIndex); auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) { + auto engineGroupType = hwHelper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, hardwareInfo.platform.eProductFamily)) { return 0; } diff --git a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp index 491006cd0c..53304de7ca 100644 --- a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp @@ -547,7 +547,7 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs sPatchBindingTableState.Count = 1; sPatchBindingTableState.SurfaceStateOffset = 0; - pClDevice->allocateSyncBufferHandler(); + pDevice->allocateSyncBufferHandler(); size_t offset = 0; size_t size = 1; @@ -591,7 +591,7 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs hwParser.parseCommands(*mockCmdQ); auto &surfaceState = hwParser.getSurfaceState(&surfaceStateHeap, 0); - auto pSyncBufferHandler = static_cast(pClDevice->syncBufferHandler.get()); + auto pSyncBufferHandler = static_cast(pDevice->syncBufferHandler.get()); EXPECT_EQ(pSyncBufferHandler->graphicsAllocation->getGpuAddress(), surfaceState.getSurfaceBaseAddress()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp index e73edaf3c1..faeae8cf8a 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp @@ -218,7 +218,8 @@ using clEnqueueNDCountKernelTests = api_tests; TEST_F(clEnqueueNDCountKernelTests, GivenQueueIncapableWhenEnqueuingNDCountKernelINTELThenInvalidOperationIsReturned) { auto &hwHelper = HwHelper::get(::defaultHwInfo->platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), ::defaultHwInfo->platform.eProductFamily)) { + auto engineGroupType = hwHelper.getEngineGroupType(pCommandQueue->getGpgpuEngine().getEngineType(), *::defaultHwInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, ::defaultHwInfo->platform.eProductFamily)) { GTEST_SKIP(); } @@ -250,7 +251,8 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDCountKernel CommandQueue *pCmdQ2 = createCommandQueue(pClDevice); HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + auto engineGroupType = hwHelper.getEngineGroupType(pCmdQ2->getGpgpuEngine().getEngineType(), hardwareInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext; } @@ -296,7 +298,8 @@ TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalled CommandQueue *pCmdQ2 = createCommandQueue(pClDevice); HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + auto engineGroupType = hwHelper.getEngineGroupType(pCmdQ2->getGpgpuEngine().getEngineType(), hardwareInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext; } @@ -342,7 +345,8 @@ TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeas CommandQueue *pCmdQ2 = createCommandQueue(pClDevice); HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + auto engineGroupType = hwHelper.getEngineGroupType(pCmdQ2->getGpgpuEngine().getEngineType(), hardwareInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext; } diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp index 1f38106a62..1c0102812b 100644 --- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp +++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp @@ -6,7 +6,7 @@ */ #include "shared/source/program/sync_buffer_handler.h" -#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/mocks/ult_device_factory.h" #include "opencl/source/api/api.h" #include "opencl/test/unit_test/fixtures/enqueue_handler_fixture.h" @@ -86,7 +86,7 @@ class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest { } MockSyncBufferHandler *getSyncBufferHandler() { - return reinterpret_cast(pClDevice->syncBufferHandler.get()); + return reinterpret_cast(pDevice->syncBufferHandler.get()); } cl_int enqueueNDCount() { @@ -94,7 +94,8 @@ class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest { } bool isCooperativeDispatchSupported() { - return hwHelper->isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), commandQueue->getDevice().getHardwareInfo().platform.eProductFamily); + auto engineGroupType = hwHelper->getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo); + return hwHelper->isCooperativeDispatchSupported(engineGroupType, commandQueue->getDevice().getHardwareInfo().platform.eProductFamily); } const cl_uint workDim = 1; @@ -157,7 +158,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingCon HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingConcurrentKernelThenErrorIsReturned) { size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue); workgroupCount[0] = maxWorkGroupCount + 1; - globalWorkSize[0] = maxWorkGroupCount * lws[0] + 1; + globalWorkSize[0] = maxWorkGroupCount * lws[0]; auto retVal = enqueueNDCount(); EXPECT_EQ(CL_INVALID_VALUE, retVal); @@ -180,7 +181,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer kernelInternals->kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindfulAndStateless; patchAllocateSyncBuffer(); - pClDevice->allocateSyncBufferHandler(); + pDevice->allocateSyncBufferHandler(); auto syncBufferHandler = getSyncBufferHandler(); auto surfaceState = reinterpret_cast(ptrOffset(kernel->getSurfaceStateHeap(rootDeviceIndex), sPatchAllocateSyncBuffer.SurfaceStateHeapOffset)); @@ -196,7 +197,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer TEST(SyncBufferHandlerDeviceTest, GivenRootDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) { const size_t testUsedBufferSize = 100; - MockClDevice rootDevice{new MockDevice}; + MockDevice rootDevice; rootDevice.allocateSyncBufferHandler(); auto syncBufferHandler = reinterpret_cast(rootDevice.syncBufferHandler.get()); @@ -210,20 +211,17 @@ TEST(SyncBufferHandlerDeviceTest, GivenRootDeviceWhenAllocateSyncBufferIsCalledT } TEST(SyncBufferHandlerDeviceTest, GivenSubDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) { - DebugManagerStateRestore restorer; - DebugManager.flags.CreateMultipleSubDevices.set(2); - VariableBackup mockDeviceFlagBackup(&MockDevice::createSingleDevice, false); - auto rootDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); - auto &subDevice = rootDevice->subDevices[0]; - subDevice->allocateSyncBufferHandler(); - auto syncBufferHandler = reinterpret_cast(subDevice->syncBufferHandler.get()); + UltDeviceFactory ultDeviceFactory{1, 2}; + auto pSubDevice = ultDeviceFactory.subDevices[0]; + pSubDevice->allocateSyncBufferHandler(); + auto syncBufferHandler = reinterpret_cast(pSubDevice->syncBufferHandler.get()); const size_t testUsedBufferSize = 100; ASSERT_NE(syncBufferHandler->usedBufferSize, testUsedBufferSize); syncBufferHandler->usedBufferSize = testUsedBufferSize; - subDevice->allocateSyncBufferHandler(); - syncBufferHandler = reinterpret_cast(subDevice->syncBufferHandler.get()); + pSubDevice->allocateSyncBufferHandler(); + syncBufferHandler = reinterpret_cast(pSubDevice->syncBufferHandler.get()); EXPECT_EQ(testUsedBufferSize, syncBufferHandler->usedBufferSize); } diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index 7782af24fa..777a32cc83 100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -921,8 +921,10 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT size_t localWorkSize[3] = {1, 1, 1}; CommandQueue *commandQueue = nullptr; WithCastToInternal(cmdQ, &commandQueue); - HwHelper &hwHelper = HwHelper::get(pDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); - if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + auto &hwInfo = pDevice->getDevice().getHardwareInfo(); + HwHelper &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto engineGroupType = hwHelper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(), hwInfo); + if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { commandQueue->getGpgpuEngine().osContext = commandQueue->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext; } size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize, commandQueue); diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp index 862eebf345..9d219cfe86 100644 --- a/shared/source/device/device.cpp +++ b/shared/source/device/device.cpp @@ -44,6 +44,7 @@ Device::~Device() { engine.commandStreamReceiver->flushBatchedSubmissions(); } + syncBufferHandler.reset(); commandStreamReceivers.clear(); executionEnvironment->memoryManager->waitForDeletions(); @@ -308,6 +309,15 @@ GmmClientContext *Device::getGmmClientContext() const { return getGmmHelper()->getClientContext(); } +void Device::allocateSyncBufferHandler() { + static std::mutex mutex; + std::unique_lock lock(mutex); + if (syncBufferHandler.get() == nullptr) { + syncBufferHandler = std::make_unique(*this); + UNRECOVERABLE_IF(syncBufferHandler.get() == nullptr); + } +} + uint64_t Device::getGlobalMemorySize(uint32_t deviceBitfield) const { auto globalMemorySize = getMemoryManager()->isLocalMemorySupported(this->getRootDeviceIndex()) ? getMemoryManager()->getLocalMemorySize(this->getRootDeviceIndex(), deviceBitfield) diff --git a/shared/source/device/device.h b/shared/source/device/device.h index e2dbd6edd2..c6a2449679 100644 --- a/shared/source/device/device.h +++ b/shared/source/device/device.h @@ -15,6 +15,7 @@ #include "shared/source/helpers/engine_control.h" #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/hw_info.h" +#include "shared/source/program/sync_buffer_handler.h" #include "opencl/source/os_interface/performance_counters.h" @@ -92,6 +93,7 @@ class Device : public ReferenceTrackedObject { } MOCKABLE_VIRTUAL CompilerInterface *getCompilerInterface() const; BuiltIns *getBuiltIns() const; + void allocateSyncBufferHandler(); virtual uint32_t getRootDeviceIndex() const = 0; virtual uint32_t getNumAvailableDevices() const = 0; @@ -101,6 +103,7 @@ class Device : public ReferenceTrackedObject { virtual BindlessHeapsHelper *getBindlessHeapsHelper() const = 0; static decltype(&PerformanceCounters::create) createPerformanceCountersFunc; + std::unique_ptr syncBufferHandler; protected: Device() = delete; diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 04e1590d39..906925f33a 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -124,7 +124,7 @@ class HwHelper { virtual bool useOnlyGlobalTimestamps() const = 0; virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0; virtual bool packedFormatsSupported() const = 0; - virtual bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const = 0; + virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const = 0; virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual bool isMediaBlockIOSupported(const HardwareInfo &hwInfo) const = 0; virtual bool isCopyOnlyEngineType(EngineGroupType type) const = 0; @@ -325,7 +325,7 @@ class HwHelperHw : public HwHelper { bool packedFormatsSupported() const override; - bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const override; + bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const override; size_t getMaxFillPaternSizeForCopyEngine() const override; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index 49ed996927..f9de825950 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -536,7 +536,7 @@ bool MemorySynchronizationCommands::isPipeControlPriorToPipelineSelec } template -bool HwHelperHw::isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const { +bool HwHelperHw::isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const { return true; } diff --git a/shared/source/program/CMakeLists.txt b/shared/source/program/CMakeLists.txt index a619d9281b..5d001f178c 100644 --- a/shared/source/program/CMakeLists.txt +++ b/shared/source/program/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2019-2020 Intel Corporation +# Copyright (C) 2019-2021 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -16,6 +16,7 @@ set(NEO_CORE_PROGRAM ${CMAKE_CURRENT_SOURCE_DIR}/program_initialization.h ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h + ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.inl ) set_property(GLOBAL PROPERTY NEO_CORE_PROGRAM ${NEO_CORE_PROGRAM}) diff --git a/shared/source/program/sync_buffer_handler.cpp b/shared/source/program/sync_buffer_handler.cpp index bd46449c8d..d297a72592 100644 --- a/shared/source/program/sync_buffer_handler.cpp +++ b/shared/source/program/sync_buffer_handler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -11,8 +11,6 @@ #include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/memory_manager/memory_manager.h" -#include "opencl/source/kernel/kernel.h" - namespace NEO { SyncBufferHandler::~SyncBufferHandler() { @@ -24,22 +22,6 @@ SyncBufferHandler::SyncBufferHandler(Device &device) allocateNewBuffer(); } -void SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, Kernel &kernel) { - auto requiredSize = workGroupsCount; - std::lock_guard guard(this->mutex); - - bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize); - if (isCurrentBufferFull) { - memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); - allocateNewBuffer(); - usedBufferSize = 0; - } - - kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize); - - usedBufferSize += requiredSize; -} - void SyncBufferHandler::makeResident(CommandStreamReceiver &csr) { csr.makeResident(*graphicsAllocation); } diff --git a/shared/source/program/sync_buffer_handler.h b/shared/source/program/sync_buffer_handler.h index 29de7dac4b..9a27d467ab 100644 --- a/shared/source/program/sync_buffer_handler.h +++ b/shared/source/program/sync_buffer_handler.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,8 +17,8 @@ class CommandStreamReceiver; class Context; class Device; class GraphicsAllocation; -class MemoryManager; class Kernel; +class MemoryManager; class SyncBufferHandler { public: @@ -26,7 +26,8 @@ class SyncBufferHandler { SyncBufferHandler(Device &device); - void prepareForEnqueue(size_t workGroupsCount, Kernel &kernel); + template + void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel); void makeResident(CommandStreamReceiver &csr); protected: diff --git a/shared/source/program/sync_buffer_handler.inl b/shared/source/program/sync_buffer_handler.inl new file mode 100644 index 0000000000..1e7d5d7dba --- /dev/null +++ b/shared/source/program/sync_buffer_handler.inl @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/memory_manager/memory_manager.h" + +template +void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) { + auto requiredSize = workGroupsCount; + std::lock_guard guard(this->mutex); + + bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize); + if (isCurrentBufferFull) { + memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); + allocateNewBuffer(); + usedBufferSize = 0; + } + + kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize); + + usedBufferSize += requiredSize; +}