From de1e4e00748dfdee04d89ce813d61de319a0d309 Mon Sep 17 00:00:00 2001 From: Filip Hazubski Date: Mon, 23 Aug 2021 18:02:53 +0000 Subject: [PATCH] Add adjustMaxWorkGroupCount helper Signed-off-by: Filip Hazubski --- level_zero/api/core/ze_module.cpp | 2 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 3 ++- level_zero/core/source/kernel/kernel.h | 3 ++- level_zero/core/source/kernel/kernel_imp.cpp | 4 +++- level_zero/core/source/kernel/kernel_imp.h | 3 ++- .../test_cmdlist_append_launch_kernel_1.cpp | 2 +- opencl/source/kernel/kernel.cpp | 21 +++++++++++-------- shared/source/helpers/hw_helper.h | 5 +++++ shared/source/helpers/hw_helper_base.inl | 6 ++++++ 9 files changed, 34 insertions(+), 15 deletions(-) diff --git a/level_zero/api/core/ze_module.cpp b/level_zero/api/core/ze_module.cpp index 95ca1e9af2..2f20892fa8 100644 --- a/level_zero/api/core/ze_module.cpp +++ b/level_zero/api/core/ze_module.cpp @@ -111,7 +111,7 @@ ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSuggestMaxCooperativeGroupCount( ze_kernel_handle_t hKernel, uint32_t *totalGroupCount) { - return L0::Kernel::fromHandle(hKernel)->suggestMaxCooperativeGroupCount(totalGroupCount); + return L0::Kernel::fromHandle(hKernel)->suggestMaxCooperativeGroupCount(totalGroupCount, NEO::EngineGroupType::Compute, false); } ZE_APIEXPORT ze_result_t ZE_APICALL diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 69f1ff6e6d..06b6850c7f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1880,7 +1880,8 @@ ze_result_t CommandListCoreFamily::programSyncBuffer(Kernel &kern } uint32_t maximalNumberOfWorkgroupsAllowed; - auto ret = kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed); + auto ret = kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed, this->engineGroupType, + device.getDefaultEngine().osContext->isEngineInstanced()); UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS); size_t requestedNumberOfWorkgroups = (pThreadGroupDimensions->groupCountX * pThreadGroupDimensions->groupCountY * pThreadGroupDimensions->groupCountZ); diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h index 0bbdd97f99..037d9a9815 100644 --- a/level_zero/core/source/kernel/kernel.h +++ b/level_zero/core/source/kernel/kernel.h @@ -118,7 +118,8 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI { virtual void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) = 0; - virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) = 0; + virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType, + bool isEngineInstanced) = 0; virtual ze_result_t setCacheConfig(ze_cache_config_flags_t flags) = 0; virtual ze_result_t getProfileInfo(zet_profile_properties_t *pProfileProperties) = 0; diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index e3a6517d1b..c91d079986 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -393,7 +393,8 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz return ZE_RESULT_SUCCESS; } -ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) { +ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType, + bool isEngineInstanced) { UNRECOVERABLE_IF(0 == groupSize[0]); UNRECOVERABLE_IF(0 == groupSize[1]); UNRECOVERABLE_IF(0 == groupSize[2]); @@ -423,6 +424,7 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount hwHelper.getBarriersCountFromHasBarriers(barrierCount), workDim, localWorkSize); + *totalGroupCount = hwHelper.adjustMaxWorkGroupCount(*totalGroupCount, engineGroupType, hardwareInfo, isEngineInstanced); return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 79ffb53444..72868b5a18 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -45,7 +45,8 @@ struct KernelImp : Kernel { ze_result_t getKernelName(size_t *pSize, char *pName) override; - ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) override; + ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType, + bool isEngineInstanced) override; const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); } uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp index 1e923a23a7..c3bc096f2a 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp @@ -1038,7 +1038,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau { VariableBackup groupCountX{&groupCount.groupCountX}; uint32_t maximalNumberOfWorkgroupsAllowed; - kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed); + kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed, NEO::EngineGroupType::Compute, false); groupCountX = maximalNumberOfWorkgroupsAllowed + 1; pCommandList = std::make_unique>>(); pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 2d87b407fe..8b706ed978 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1056,15 +1056,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount; - return KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(), - availableThreadCount, - dssCount, - dssCount * KB * hardwareInfo.capabilityTable.slmSize, - hwHelper.alignSlmSize(slmTotalSize), - static_cast(hwHelper.getMaxBarrierRegisterPerSlice()), - hwHelper.getBarriersCountFromHasBarriers(barrierCount), - workDim, - localWorkSize); + auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(), + availableThreadCount, + dssCount, + dssCount * KB * hardwareInfo.capabilityTable.slmSize, + hwHelper.alignSlmSize(slmTotalSize), + static_cast(hwHelper.getMaxBarrierRegisterPerSlice()), + hwHelper.getBarriersCountFromHasBarriers(barrierCount), + workDim, + localWorkSize); + auto isEngineInstanced = commandQueue->getCommandStreamReceiver(false).getOsContext().isEngineInstanced(); + maxWorkGroupCount = hwHelper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, hardwareInfo, isEngineInstanced); + return maxWorkGroupCount; } inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) { diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index b1a09c4cfe..7e105f338b 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -126,6 +126,8 @@ class HwHelper { virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0; virtual bool packedFormatsSupported() const = 0; virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType) const = 0; + virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, + const HardwareInfo &hwInfo, bool isEngineInstanced) const = 0; virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual bool isCopyOnlyEngineType(EngineGroupType type) const = 0; virtual bool isSipWANeeded(const HardwareInfo &hwInfo) const = 0; @@ -334,6 +336,9 @@ class HwHelperHw : public HwHelper { bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType) const override; + uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, + const HardwareInfo &hwInfo, bool isEngineInstanced) const override; + size_t getMaxFillPaternSizeForCopyEngine() const override; bool isKmdMigrationSupported(const HardwareInfo &hwInfo) const override; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index 199ddffff2..8ca929a2c0 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -596,6 +596,12 @@ bool HwHelperHw::isCooperativeDispatchSupported(const EngineGroupType return true; } +template +uint32_t HwHelperHw::adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, + const HardwareInfo &hwInfo, bool isEngineInstanced) const { + return maxWorkGroupCount; +} + template bool HwHelperHw::isKmdMigrationSupported(const HardwareInfo &hwInfo) const { return false;