mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 01:35:20 +08:00
fix: to always use grfs count in calculateNumThreadsPerThreadGroup
grf size != grf count Related-To: GSD-8437 Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
df54d67f40
commit
da7b03dd15
@@ -137,7 +137,7 @@ struct EncodeDispatchKernel {
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
|
||||
static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData,
|
||||
const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
static void *getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset);
|
||||
@@ -155,10 +155,10 @@ struct EncodeDispatchKernel {
|
||||
static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
|
||||
|
||||
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
|
||||
|
||||
|
||||
@@ -729,7 +729,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {}
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
|
||||
@@ -759,7 +759,7 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
@@ -777,7 +777,7 @@ void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispat
|
||||
|
||||
if (algorithmVersion == 2) {
|
||||
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
|
||||
if (numGrf == 256) {
|
||||
if (grfCount == 256) {
|
||||
threadsPerXeCore /= 2;
|
||||
}
|
||||
auto tgDispatchSizeSelected = 8;
|
||||
@@ -819,11 +819,11 @@ void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispat
|
||||
}
|
||||
} else {
|
||||
if (adjustTGDispatchSize) {
|
||||
UNRECOVERABLE_IF(numGrf == 0u);
|
||||
UNRECOVERABLE_IF(grfCount == 0u);
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
||||
const uint32_t tilesCount = device.getNumSubDevices();
|
||||
availableThreadCount *= tilesCount;
|
||||
|
||||
@@ -31,7 +31,7 @@ namespace NEO {
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
auto grfSize = sizeof(typename Family::GRF);
|
||||
|
||||
@@ -14,9 +14,9 @@ template void NEO::EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Fa
|
||||
template void NEO::EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void NEO::EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::DefaultWalkerType &walkerCmd);
|
||||
template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, Family::DefaultWalkerType &walkerCmd);
|
||||
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerType>(CommandContainer &container, EncodeDispatchKernelArgs &args);
|
||||
template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
@@ -41,7 +41,7 @@ constexpr size_t immWriteDestinationAddressAlignment = 8;
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
}
|
||||
@@ -377,7 +377,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, walkerCmd);
|
||||
if (debugManager.flags.PrintKernelDispatchParameters.get()) {
|
||||
fprintf(stdout, "kernel, %s, numGrf, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
|
||||
fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
|
||||
kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
kernelDescriptor.kernelAttributes.numGrfRequired,
|
||||
kernelDescriptor.kernelAttributes.simdSize,
|
||||
|
||||
Reference in New Issue
Block a user