// Patch summary (reviewer preamble placed before the first `diff --git` header).
//
// What the visible hunks change:
//  * shared/source/helpers/hw_helper.h: declares the pure virtual
//    HwHelper::isCooperativeDispatchSupported(engine, productFamily) and its
//    override in HwHelperHw.
//  * shared/source/helpers/hw_helper_base.inl: the HwHelperHw implementation
//    shown here unconditionally returns true.
//  * opencl/source/kernel/kernel.{h,cpp}: Kernel::getMaxWorkGroupCount gains a
//    `const CommandQueue *commandQueue` parameter and returns 0 when the helper
//    reports that cooperative dispatch is not supported for the queue's GPGPU
//    engine type and the device's product family. The HwHelper lookup is moved
//    to the top of the function so the check runs before any other work.
//  * opencl/source/api/api.cpp:
//      - clGetKernelMaxConcurrentWorkGroupCountINTEL casts the cl_command_queue
//        with WithCastToInternal and forwards the result to getMaxWorkGroupCount.
//      - clEnqueueNDCountKernelINTEL returns CL_INVALID_COMMAND_QUEUE when the
//        helper reports cooperative dispatch as unsupported, and forwards
//        pCommandQueue to getMaxWorkGroupCount.
//  * Unit tests: callers pass the command queue to getMaxWorkGroupCount; when
//    the helper reports "unsupported", the tests replace the queue's osContext
//    with the one from the aub_stream::ENGINE_CCS engine. A new fixture,
//    SyncBufferEnqueueHandlerTest, builds its own MockDevice/MockClDevice/
//    MockContext from defaultHwInfo.
//
// NOTE(review): Kernel::getMaxWorkGroupCount dereferences `commandQueue`
//   unconditionally. In clGetKernelMaxConcurrentWorkGroupCountINTEL the local
//   pCommandQueue starts as nullptr and the return value of WithCastToInternal
//   is not inspected in the lines shown - confirm that the queue is validated
//   earlier in that function (outside this hunk).
// NOTE(review): SyncBufferEnqueueHandlerTest::SetUp()/TearDown() are written
//   without `override`, while the derived SyncBufferHandlerTest uses
//   `override` on the same functions - consider marking them for consistency.
// NOTE(review): the new `HwHelper *hwHelper;` member has no initializer; in the
//   lines shown it is only assigned inside SetUpT().
// NOTE(review): several template argument lists look stripped in this copy
//   (e.g. `std::unique_ptr kernel(`, `template void SetUpT()`,
//   `const_cast(pTagMemory)`, `reinterpret_cast(new MockCommandQueueHw(`,
//   `template inline bool NEO::HwHelperHw::`), and the original line breaks
//   are collapsed - presumably extraction damage; TODO confirm against the
//   original commit before applying.
diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 77a099022d..f552623f04 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -5541,7 +5541,9 @@ cl_int CL_API_CALL clGetKernelMaxConcurrentWorkGroupCountINTEL(cl_command_queue return retVal; } - *suggestedWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize); + CommandQueue *pCommandQueue = nullptr; + WithCastToInternal(commandQueue, &pCommandQueue); + *suggestedWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue); return retVal; } @@ -5579,6 +5581,13 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue, return retVal; } + auto &hardwareInfo = pKernel->getDevice().getHardwareInfo(); + auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) { + retVal = CL_INVALID_COMMAND_QUEUE; + return retVal; + } + size_t globalWorkSize[3]; for (size_t i = 0; i < workDim; i++) { globalWorkSize[i] = workgroupCount[i] * localWorkSize[i]; @@ -5589,7 +5598,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue, for (size_t i = 0; i < workDim; i++) { requestedNumberOfWorkgroups *= workgroupCount[i]; } - size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize); + size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue); if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) { retVal = CL_INVALID_VALUE; return retVal; diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 52271c1d3d..b199ca518e 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1053,14 +1053,19 @@ void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *glob localWorkSize[2] = 
suggestedLws.z; } -uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const { +uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const { auto &hardwareInfo = getDevice().getHardwareInfo(); + auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + + if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) { + return 0; + } + auto executionEnvironment = kernelInfo.patchInfo.executionEnvironment; auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount; if (dssCount == 0) { dssCount = hardwareInfo.gtSystemInfo.SubSliceCount; } - auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); auto availableThreadCount = hwHelper.calculateAvailableThreadCount( hardwareInfo.platform.eProductFamily, ((executionEnvironment != nullptr) ? executionEnvironment->NumGRFRequired : GrfConfig::DefaultGrfNumber), diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index 4d9267f168..f5b5e89d27 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -408,7 +408,7 @@ class Kernel : public BaseObject<_cl_kernel> { } void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset, size_t *localWorkSize); - uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const; + uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const; uint64_t getKernelStartOffset( const bool localIdsGenerationByRuntime, diff --git a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl index 06724e18c0..1b4cf8d73d 100644 --- 
a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl +++ b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl @@ -64,7 +64,7 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting retVal = clGetKernelMaxConcurrentWorkGroupCountINTEL(pCommandQueue, pKernel, workDim, globalWorkOffset, localWorkSize, &maxConcurrentWorkGroupCount); EXPECT_EQ(CL_SUCCESS, retVal); - size_t expectedMaxConcurrentWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize); + size_t expectedMaxConcurrentWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue); EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount); std::unique_ptr pKernelWithExecutionEnvironmentPatch(MockKernel::create(pCommandQueue->getDevice(), pProgram)); @@ -72,7 +72,7 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting globalWorkOffset, localWorkSize, &maxConcurrentWorkGroupCount); EXPECT_EQ(CL_SUCCESS, retVal); - expectedMaxConcurrentWorkGroupCount = pKernelWithExecutionEnvironmentPatch->getMaxWorkGroupCount(workDim, localWorkSize); + expectedMaxConcurrentWorkGroupCount = pKernelWithExecutionEnvironmentPatch->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue); EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount); } diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp index 374a0c34aa..f054c52574 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp @@ -214,6 +214,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDCountKernel cl_int retVal = CL_SUCCESS; CommandQueue *pCmdQ2 = createCommandQueue(pClDevice); + HwHelper &hwHelper = 
HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); + if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext; + } + std::unique_ptr kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); EXPECT_EQ(CL_SUCCESS, retVal); @@ -253,6 +258,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalled cl_int retVal = CL_SUCCESS; CommandQueue *pCmdQ2 = createCommandQueue(pClDevice); + HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); + if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext; + } + std::unique_ptr kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); EXPECT_EQ(CL_SUCCESS, retVal); @@ -292,6 +302,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeas cl_int retVal = CL_SUCCESS; CommandQueue *pCmdQ2 = createCommandQueue(pClDevice); + HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); + if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext; + } + std::unique_ptr kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); EXPECT_EQ(CL_SUCCESS, retVal); diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp 
b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp index 6496d0258e..ad4575e22e 100644 --- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp +++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp @@ -25,25 +25,56 @@ class MockSyncBufferHandler : public SyncBufferHandler { using SyncBufferHandler::usedBufferSize; }; -class SyncBufferHandlerTest : public EnqueueHandlerTest { +class SyncBufferEnqueueHandlerTest : public EnqueueHandlerTest { + public: + void SetUp() { + hardwareInfo = *defaultHwInfo; + uint64_t hwInfoConfig = defaultHardwareInfoConfigTable[productFamily]; + hardwareInfoSetup[productFamily](&hardwareInfo, true, hwInfoConfig); + SetUpImpl(&hardwareInfo); + } + + void TearDown() { + context->decRefInternal(); + delete pClDevice; + pClDevice = nullptr; + pDevice = nullptr; + } + + void SetUpImpl(const NEO::HardwareInfo *hardwareInfo) { + pDevice = MockDevice::createWithNewExecutionEnvironment(hardwareInfo); + ASSERT_NE(nullptr, pDevice); + pClDevice = new MockClDevice{pDevice}; + ASSERT_NE(nullptr, pClDevice); + + auto &commandStreamReceiver = pDevice->getGpgpuCommandStreamReceiver(); + pTagMemory = commandStreamReceiver.getTagAddress(); + ASSERT_NE(nullptr, const_cast(pTagMemory)); + + context = new NEO::MockContext(pClDevice); + } +}; + +class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest { public: void SetUp() override {} void TearDown() override {} template void SetUpT() { - EnqueueHandlerTest::SetUp(); + SyncBufferEnqueueHandlerTest::SetUp(); kernelInternals = std::make_unique(*pClDevice, context); kernel = kernelInternals->mockKernel; kernel->executionType = KernelExecutionType::Concurrent; commandQueue = reinterpret_cast(new MockCommandQueueHw(context, pClDevice, 0)); + hwHelper = &HwHelper::get(kernel->getDevice().getHardwareInfo().platform.eRenderCoreFamily); } template void TearDownT() { commandQueue->release(); kernelInternals.reset(); - EnqueueHandlerTest::TearDown(); + 
SyncBufferEnqueueHandlerTest::TearDown(); } void patchAllocateSyncBuffer() { @@ -61,6 +92,10 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest { return clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); } + bool isCooperativeDispatchSupported() { + return hwHelper->isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), kernel->getDevice().getHardwareInfo().platform.eProductFamily); + } + const cl_uint workDim = 1; const size_t gwOffset[3] = {0, 0, 0}; const size_t lws[3] = {10, 1, 1}; @@ -71,6 +106,7 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest { MockKernel *kernel; MockCommandQueue *commandQueue; SPatchAllocateSyncBuffer sPatchAllocateSyncBuffer; + HwHelper *hwHelper; }; HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchAndConcurrentKernelWhenEnqueuingKernelThenSyncBufferIsUsed) { @@ -109,7 +145,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenConcurrentKernelWithAllocateSyncB } HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingConcurrentKernelThenSuccessIsReturned) { - auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws); + auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue); workgroupCount[0] = maxWorkGroupCount; globalWorkSize[0] = maxWorkGroupCount * lws[0]; @@ -118,7 +154,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingCon } HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingConcurrentKernelThenErrorIsReturned) { - size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws); + size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue); workgroupCount[0] = maxWorkGroupCount + 1; globalWorkSize[0] = maxWorkGroupCount * lws[0] + 1; diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index a2d5846118..842c2a0588 
100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -24,6 +24,7 @@ #include "opencl/source/gtpin/gtpin_hw_helper.h" #include "opencl/source/gtpin/gtpin_init.h" #include "opencl/source/gtpin/gtpin_notify.h" +#include "opencl/source/helpers/validators.h" #include "opencl/source/kernel/kernel.h" #include "opencl/source/mem_obj/buffer.h" #include "opencl/source/program/create.inl" @@ -913,7 +914,13 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT cl_uint workDim = 1; size_t localWorkSize[3] = {1, 1, 1}; - size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize); + CommandQueue *commandQueue = nullptr; + WithCastToInternal(cmdQ, &commandQueue); + HwHelper &hwHelper = HwHelper::get(pDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily); + if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) { + commandQueue->getGpgpuEngine().osContext = commandQueue->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext; + } + size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize, commandQueue); auto buff10 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); auto buff11 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 922b908f19..9eb6b8e6ba 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -126,6 +126,7 @@ class HwHelper { virtual bool useOnlyGlobalTimestamps() const = 0; virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0; virtual bool packedFormatsSupported() const = 0; + virtual bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const = 0; static uint32_t getSubDevicesCount(const 
HardwareInfo *pHwInfo); static uint32_t getEnginesCount(const HardwareInfo &hwInfo); @@ -308,6 +309,8 @@ class HwHelperHw : public HwHelper { bool packedFormatsSupported() const override; + bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const override; + protected: LocalMemoryAccessMode getDefaultLocalMemoryAccessMode(const HardwareInfo &hwInfo) const override; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index e5f752a963..008552a8e1 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -504,4 +504,9 @@ bool MemorySynchronizationCommands::isPipeControlPriorToPipelineSelec return false; } +template +inline bool NEO::HwHelperHw::isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const { + return true; +} + } // namespace NEO