Add adjustMaxWorkGroupCount helper

Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
Filip Hazubski
2021-08-23 18:02:53 +00:00
committed by Compute-Runtime-Automation
parent 8d60fb2a07
commit de1e4e0074
9 changed files with 34 additions and 15 deletions

View File

@@ -111,7 +111,7 @@ ZE_APIEXPORT ze_result_t ZE_APICALL
zeKernelSuggestMaxCooperativeGroupCount( zeKernelSuggestMaxCooperativeGroupCount(
ze_kernel_handle_t hKernel, ze_kernel_handle_t hKernel,
uint32_t *totalGroupCount) { uint32_t *totalGroupCount) {
return L0::Kernel::fromHandle(hKernel)->suggestMaxCooperativeGroupCount(totalGroupCount); return L0::Kernel::fromHandle(hKernel)->suggestMaxCooperativeGroupCount(totalGroupCount, NEO::EngineGroupType::Compute, false);
} }
ZE_APIEXPORT ze_result_t ZE_APICALL ZE_APIEXPORT ze_result_t ZE_APICALL

View File

@@ -1880,7 +1880,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
} }
uint32_t maximalNumberOfWorkgroupsAllowed; uint32_t maximalNumberOfWorkgroupsAllowed;
auto ret = kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed); auto ret = kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed, this->engineGroupType,
device.getDefaultEngine().osContext->isEngineInstanced());
UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS); UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS);
size_t requestedNumberOfWorkgroups = (pThreadGroupDimensions->groupCountX * pThreadGroupDimensions->groupCountY * size_t requestedNumberOfWorkgroups = (pThreadGroupDimensions->groupCountX * pThreadGroupDimensions->groupCountY *
pThreadGroupDimensions->groupCountZ); pThreadGroupDimensions->groupCountZ);

View File

@@ -118,7 +118,8 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
virtual void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) = 0; virtual void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) = 0;
virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) = 0; virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType,
bool isEngineInstanced) = 0;
virtual ze_result_t setCacheConfig(ze_cache_config_flags_t flags) = 0; virtual ze_result_t setCacheConfig(ze_cache_config_flags_t flags) = 0;
virtual ze_result_t getProfileInfo(zet_profile_properties_t *pProfileProperties) = 0; virtual ze_result_t getProfileInfo(zet_profile_properties_t *pProfileProperties) = 0;

View File

@@ -393,7 +393,8 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
return ZE_RESULT_SUCCESS; return ZE_RESULT_SUCCESS;
} }
ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) { ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType,
bool isEngineInstanced) {
UNRECOVERABLE_IF(0 == groupSize[0]); UNRECOVERABLE_IF(0 == groupSize[0]);
UNRECOVERABLE_IF(0 == groupSize[1]); UNRECOVERABLE_IF(0 == groupSize[1]);
UNRECOVERABLE_IF(0 == groupSize[2]); UNRECOVERABLE_IF(0 == groupSize[2]);
@@ -423,6 +424,7 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
hwHelper.getBarriersCountFromHasBarriers(barrierCount), hwHelper.getBarriersCountFromHasBarriers(barrierCount),
workDim, workDim,
localWorkSize); localWorkSize);
*totalGroupCount = hwHelper.adjustMaxWorkGroupCount(*totalGroupCount, engineGroupType, hardwareInfo, isEngineInstanced);
return ZE_RESULT_SUCCESS; return ZE_RESULT_SUCCESS;
} }

View File

@@ -45,7 +45,8 @@ struct KernelImp : Kernel {
ze_result_t getKernelName(size_t *pSize, char *pName) override; ze_result_t getKernelName(size_t *pSize, char *pName) override;
ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) override; ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType,
bool isEngineInstanced) override;
const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); } const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); }
uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; } uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; }

View File

@@ -1038,7 +1038,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
{ {
VariableBackup<uint32_t> groupCountX{&groupCount.groupCountX}; VariableBackup<uint32_t> groupCountX{&groupCount.groupCountX};
uint32_t maximalNumberOfWorkgroupsAllowed; uint32_t maximalNumberOfWorkgroupsAllowed;
kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed); kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed, NEO::EngineGroupType::Compute, false);
groupCountX = maximalNumberOfWorkgroupsAllowed + 1; groupCountX = maximalNumberOfWorkgroupsAllowed + 1;
pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>(); pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u); pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u);

View File

@@ -1056,15 +1056,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);
auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount; auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount;
return KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(), auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(),
availableThreadCount, availableThreadCount,
dssCount, dssCount,
dssCount * KB * hardwareInfo.capabilityTable.slmSize, dssCount * KB * hardwareInfo.capabilityTable.slmSize,
hwHelper.alignSlmSize(slmTotalSize), hwHelper.alignSlmSize(slmTotalSize),
static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()), static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
hwHelper.getBarriersCountFromHasBarriers(barrierCount), hwHelper.getBarriersCountFromHasBarriers(barrierCount),
workDim, workDim,
localWorkSize); localWorkSize);
auto isEngineInstanced = commandQueue->getCommandStreamReceiver(false).getOsContext().isEngineInstanced();
maxWorkGroupCount = hwHelper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, hardwareInfo, isEngineInstanced);
return maxWorkGroupCount;
} }
inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) { inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) {

View File

@@ -126,6 +126,8 @@ class HwHelper {
virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0; virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0;
virtual bool packedFormatsSupported() const = 0; virtual bool packedFormatsSupported() const = 0;
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType) const = 0; virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType) const = 0;
// Hook applied to an already computed maximum work-group count before it is
// reported to the caller. Receives the count together with the engine group
// type, the hardware info and whether the engine is instanced, and returns the
// count that should actually be used.
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const HardwareInfo &hwInfo, bool isEngineInstanced) const = 0;
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
virtual bool isCopyOnlyEngineType(EngineGroupType type) const = 0; virtual bool isCopyOnlyEngineType(EngineGroupType type) const = 0;
virtual bool isSipWANeeded(const HardwareInfo &hwInfo) const = 0; virtual bool isSipWANeeded(const HardwareInfo &hwInfo) const = 0;
@@ -334,6 +336,9 @@ class HwHelperHw : public HwHelper {
bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType) const override; bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType) const override;
// Per-family override of HwHelper::adjustMaxWorkGroupCount; returns the
// maximum work-group count to use for the given engine group type, hardware
// info and engine-instanced flag.
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const HardwareInfo &hwInfo, bool isEngineInstanced) const override;
size_t getMaxFillPaternSizeForCopyEngine() const override; size_t getMaxFillPaternSizeForCopyEngine() const override;
bool isKmdMigrationSupported(const HardwareInfo &hwInfo) const override; bool isKmdMigrationSupported(const HardwareInfo &hwInfo) const override;

View File

@@ -596,6 +596,12 @@ bool HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const EngineGroupType
return true; return true;
} }
// Generic implementation of adjustMaxWorkGroupCount: performs no adjustment
// and returns maxWorkGroupCount unchanged. engineGroupType, hwInfo and
// isEngineInstanced are part of the signature but are not read here.
// NOTE(review): presumably family-specific specializations replace this
// pass-through where an adjustment is required — confirm.
template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const HardwareInfo &hwInfo, bool isEngineInstanced) const {
return maxWorkGroupCount;
}
template <typename GfxFamily> template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isKmdMigrationSupported(const HardwareInfo &hwInfo) const { bool HwHelperHw<GfxFamily>::isKmdMigrationSupported(const HardwareInfo &hwInfo) const {
return false; return false;